Code example #1
import os


# pick_model and load_dataset are project helpers defined elsewhere.
def run(settings):
    model = pick_model(settings.model)()
    X_train, X_valid, Y_train, Y_valid = load_dataset(settings.data)
    print('Training the model')
    model.train(X_train, Y_train)
    print('Prediction accuracy on training set: {}'.format(
        model.test(X_train, Y_train)))
    print('Prediction accuracy on validation set: {}'.format(
        model.test(X_valid, Y_valid)))

    save_dir = 'saves/{}'.format(settings.name)
    save_file = '{}/save.model'.format(save_dir)
    os.makedirs(save_dir, exist_ok=True)
    model.save(save_file)
    print('Saved model in {}'.format(save_file))
Code example #2
File: build.py  Project: corgis-edu/corgis
def build(dataset_name: str, format: str, config: Config) -> Iterator[BuildReport]:
    """
    Builds the named dataset (or all datasets, if "*" is given) in the
    specified format(s), yielding a report for each build.

    Args:
        dataset_name (str): The name of the dataset (e.g., "police-shootings"),
            or "*" to build every dataset.
        format (str): The desired output format(s), e.g., "python" or "json".
        config (Config): The configuration information.
    Yields:
        BuildReport: Information about each built dataset.
    """
    formats = parse_format(format)
    if dataset_name == "*":
        dataset_names = get_all_datasets()
    else:
        dataset_names = [dataset_name]
    for dataset_name in dataset_names:
        dataset = load_dataset(dataset_name)
        for format in formats:
            yield build_dataset(dataset, format, config)
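
Since build() is a generator, its reports have to be consumed by iterating. A minimal usage sketch under one assumption: Config is treated as default-constructible here purely for illustration (its real setup is project-specific and not shown).

# Hypothetical usage: build every dataset in the "python" format
# and print each BuildReport as it is produced.
config = Config()  # assumption: real Config setup is project-specific
for report in build("*", "python", config):
    print(report)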
Code example #3
        # end if
    # end for
    return base_w


# end create_matrices

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset()

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()

# Create W matrices
base_w = create_matrices(int(args.get_space()['reservoir_size'][-1]),
                         float(args.get_space()['w_sparsity'][-1]),
                         int(args.get_space()['n_layers'][-1]))

# Iterate
for space in param_space:
    # Params
Code example #4
    # end if

    return esn_wv, esn_c3


# end create_esn_models

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset_wv, sfgram_loader_train_wv, sfgram_loader_test_wv = dataset.load_dataset(
    args.author, 'wv')
sfgram_dataset_c3, sfgram_loader_train_c3, sfgram_loader_test_c3 = dataset.load_dataset(
    args.author, 'c3')

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset_wv.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset_wv.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
Code example #5
    """
    return confusion_matrix[0, 0] / (confusion_matrix[0, 0] +
                                     confusion_matrix[0, 1])


# end compute_accuracy

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset, sfgram_loader_train, sfgram_loader_test = dataset.load_dataset(
    args.author, args.transformer[0][0][0])

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
    # Params
    reservoir_size, w_sparsity, leak_rate, input_scaling, \
Code example #6
import torch.utils.data
from torch.autograd import Variable
import echotorch.nn as etnn
import echotorch.utils
from tools import argument_parsing, dataset, functions, features
import matplotlib.pyplot as plt

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size, sentence_level=True)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
Code example #7
import echotorch.utils
from tools import argument_parsing, dataset, functions, features
import matplotlib.pyplot as plt
import torch.nn.functional as F


####################################################
# Main
####################################################


# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(args.dataset_size, shuffle=False)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
Code example #8
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torchlanguage.models
import torchlanguage.transforms as transforms
from tools import dataset, settings  # project-local modules


def train_ccsaa(fold=0,
                ccsaa_epoch=100,
                text_length=20,
                n_gram='c1',
                dataset_size=100,
                dataset_start=0,
                cuda=True,
                save=False,
                save_dir='.'):
    """
    Train CCSAA.
    :param fold: Cross-validation fold to train on.
    :param ccsaa_epoch: Early-stopping patience (epochs without improvement).
    :param text_length: Length of each character n-gram window.
    :param n_gram: 'c1' for single characters, anything else for character 2-grams.
    :param dataset_size: Number of documents to load.
    :param dataset_start: Index of the first document to load.
    :param cuda: Run on GPU if True.
    :param save: Load/save the trained model and vocabulary from/to save_dir.
    :param save_dir: Root directory for cached models.
    :return: The trained model and the token-to-index vocabulary.
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)),
                             str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Load
    if save and os.path.exists(
            os.path.join(save_path,
                         u"ccsaa." + str(fold) + u".pth")) and os.path.exists(
                             os.path.join(save_path, u"ccsaa." + str(fold) +
                                          u".voc.pth")):
        model.load_state_dict(
            torch.load(
                open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                     'rb')))
        voc = torch.load(
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.ccsaa_lr,
                          momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get test data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"),
                 'wb'))

        # Save vocabulary
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"),
                 'wb'))
    # end if

    return model, transform.transforms[1].token_to_ix
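
For reference, a minimal call of train_ccsaa as defined above; the argument values are illustrative, and with save=True a second run reloads the cached model and vocabulary from save_dir instead of retraining.

# Illustrative call: character unigrams, fold 0, cached under ./models/100/0/.
model, voc = train_ccsaa(fold=0, n_gram='c1', dataset_size=100,
                         dataset_start=0, cuda=True, save=True,
                         save_dir='./models')
print(u"Vocabulary size: {}".format(len(voc)))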
Code example #9
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torchlanguage.transforms
import torchlanguage.models
import echotorch.utils
from tools import dataset, settings  # project-local modules


def train_cgfs(fold=0,
               cgfs_epoch=100,
               n_gram='c3',
               dataset_size=100,
               dataset_start=0,
               cuda=True,
               save=False,
               save_dir='.'):
    """
    Train a CGFS selector.
    :param fold: Cross-validation fold to train on.
    :param cgfs_epoch: Early-stopping patience (epochs without improvement).
    :param n_gram: Key into settings.cgfs_output_dim (e.g. 'c3').
    :param dataset_size: Number of documents to load.
    :param dataset_start: Index of the first document to load.
    :param cuda: Run on GPU if True.
    :param save: Load/save the trained model and statistics from/to save_dir.
    :param save_dir: Root directory for cached models.
    :return: The trained model and a dict with feature mean and std.
    """
    # Global
    global stats_sum, stats_sd

    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)),
                             str(int(dataset_start)))

    # Word embedding
    transform = torchlanguage.transforms.Compose([
        torchlanguage.transforms.GloveVector(model='en_vectors_web_lg'),
        torchlanguage.transforms.ToNGram(n=3),
        torchlanguage.transforms.Reshape((-1, 3, settings.glove_embedding_dim))
    ])

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.NLLLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CGFS(
        n_gram=3,
        n_authors=settings.n_authors,
        n_features=settings.cgfs_output_dim[n_gram])
    if cuda:
        model.cuda()
    # end if

    # FS files (the save branch below writes .pth files)
    fs_file = os.path.join(save_path, u"cgfs." + str(fold) + u".pth")
    fs_info_file = os.path.join(save_path, u"cgfs.info." + str(fold) + u".pth")

    # Load cached model and statistics, matching the function's return type
    if save and os.path.exists(fs_file) and os.path.exists(fs_info_file):
        print(fs_file)
        model.load_state_dict(torch.load(open(fs_file, 'rb')))
        return model, torch.load(open(fs_info_file, 'rb'))
    # end if

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.cgfs_lr,
                          momentum=settings.cgfs_momentum)

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get test data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # View
            inputs = inputs.view((-1, 1, 3, settings.glove_embedding_dim))

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0
        doc_total = 0.0
        doc_success = 0.0

        # Statistics
        stats_sum = 0.0
        stats_sd = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # View
            inputs = inputs.view((-1, 1, 3, settings.glove_embedding_dim))

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs, labels = Variable(inputs), Variable(
                outputs), Variable(labels)
            if cuda:
                inputs, outputs, labels = inputs.cuda(), outputs.cuda(
                ), labels.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0

            # Normalized
            y_predicted = echotorch.utils.max_average_through_time(
                model_outputs, dim=0)

            # Compare
            if torch.equal(y_predicted, labels):
                doc_success += 1.0
            # end if
            doc_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        doc_accuracy = doc_success / doc_total * 100.0

        # Print and save loss
        print(
            u"Epoch {}, training total {}, train loss {}, test total {}, test loss {}, accuracy {}, doc accuracy {} (mean {}. std {})"
            .format(epoch, training_total, training_loss, test_total,
                    test_loss, accuracy, doc_accuracy, stats_sum / test_total,
                    stats_sd / test_total))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        # Fail
        if fail_count > cgfs_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"cgfs." + str(fold) + u".pth"),
                 'wb'))

        # Save info
        torch.save({
            'mean': stats_sum,
            'std': stats_sd
        },
                   open(
                       os.path.join(save_path,
                                    u"cgfs.info." + str(fold) + u".pth"),
                       'wb'))
    # end if

    return model, {'mean': stats_sum, 'std': stats_sd}
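
The analogous call for train_cgfs; it returns the model together with the mean/std statistics that the save branch writes to cgfs.info.<fold>.pth, so a caller keeps both (argument values below are illustrative).

# Illustrative call: train the character 3-gram selector on fold 0.
model, stats = train_cgfs(fold=0, n_gram='c3', dataset_size=100,
                          cuda=True, save=True, save_dir='./models')
print(u"Feature stats: mean={}, std={}".format(stats['mean'], stats['std']))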
Code example #10
args.add_argument(command="--name", name="name", type=str, help="Experiment's name", extended=False, required=True)
args.add_argument(command="--description", name="description", type=str, help="Experiment's description",
                  extended=False, required=True)
args.add_argument(command="--output", name="output", type=str, help="Experiment's output directory", required=True,
                  extended=False)
args.add_argument(command="--n-samples", name="n_samples", type=int, help="Number of different reservoir to test",
                  default=1, extended=False)
args.add_argument(command="--verbose", name="verbose", type=int, help="Verbose level", default=2, extended=False)

# Parse arguments
args.parse()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size,
    sentence_level=False,
    n_authors=args.n_authors
)

# Dataset start
reutersc50_dataset.set_start(0)

# Experiment
xp = nsNLP.tools.ResultManager(
    args.output,
    args.name,
    args.description,
    args.get_space(),
    1,
    args.k,
Code example #11
import numpy as np
import torch.utils.data
from torch.autograd import Variable
import echotorch.nn as etnn
import echotorch.utils
from tools import argument_parsing, dataset, functions, features, cenc_selector, settings

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
Code example #12
import torch

####################################################
# Main
####################################################

np.random.seed(1)
tf.set_random_seed(1)
torch.manual_seed(1)

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory
if args.inverse_dev_test:
    reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
        args.dataset_size, k=args.k, n_authors=args.n_authors)
else:
    reutersc50_dataset, reuters_loader_train, reuters_loader_test, reuters_loader_dev = dataset.load_dataset(
        args.dataset_size, k=args.k, n_authors=args.n_authors)
# end if

# Disable CUDA
if not args.cuda:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# end if

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()
Code example #13
import generators as G
import math
from tools import load_glove_embeddings as gle
import os

####################################################
# Main
####################################################

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
    dataset_size=args.dataset_size, n_authors=args.n_authors, k=10)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()

# Iterate
for space in param_space:
    # Params
    hidden_size, cell_size, feature, lang, dataset_start, window_size, learning_window, embedding_size, rnn_type, num_layers, dropout, output_dropout = functions.get_params(
        space)

    # Load GloVe
    word2index, embedding_matrix = gle.load_glove_embeddings(
Code example #14
    # end if

    return esn_c3


# end create_esn_models

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset_wv, sfgram_loader_train_wv, sfgram_loader_dev_wv, sfgram_loader_test_wv = dataset.load_dataset(
    args.author, 'wv', remove_authors=args.remove_authors)
sfgram_dataset_c3, sfgram_loader_train_c3, sfgram_loader_dev_c3, sfgram_loader_test_c3 = dataset.load_dataset(
    args.author, 'c3', remove_authors=args.remove_authors)

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset_wv.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset_wv.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Threshold
n_threshold = 200
Code example #15
import argparse
import torchlanguage.transforms
import codecs
import os  # used below for the fold directories

####################################################
# Main
####################################################

# Parse args
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str)
parser.add_argument("--k", type=int, default=10)
args = parser.parse_args()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
    k=args.k, n_authors=15)

# For each fold
for k in range(args.k):
    # Fold paths
    fold_dir = os.path.join(args.datadir, u"k{}".format(k))
    fold_train_dir = os.path.join(fold_dir, u"train")
    fold_test_dir = os.path.join(fold_dir, u"test")

    # Create directories
    try:
        os.mkdir(fold_dir)
        os.mkdir(fold_train_dir)
        os.mkdir(fold_test_dir)
    except OSError:
        pass
Code example #16
                  type=str,
                  help="Experiment's output directory",
                  required=True,
                  extended=False)
args.add_argument(command="--verbose",
                  name="verbose",
                  type=int,
                  help="Verbose level",
                  default=2,
                  extended=False)

# Parse arguments
args.parse()

# Load from directory
sfgram_dataset, sfgram_loader_train, sfgram_loader_test = dataset.load_dataset(
    args.author, '')

# Experiment
xp = nsNLP.tools.ResultManager(
    args.output,
    args.name,
    args.description,
    args.get_space(),
    1,
    args.k,
    verbose=args.verbose
)

# Average
average_k_fold = np.array([])
Code example #17
    """
    return confusion_matrix[0, 0] / (confusion_matrix[0, 0] +
                                     confusion_matrix[0, 1])


# end compute_accuracy

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset, sfgram_loader_train, sfgram_loader_dev, sfgram_loader_test = dataset.load_dataset(
    args.author, args.transformer[0][0][0], remove_authors=args.remove_authors)

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
    # Params
    reservoir_size, w_sparsity, leak_rate, input_scaling, \
Code example #18
import os
import matplotlib.pyplot as plt

####################################################
# Main
####################################################


# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory
if args.inverse_dev_test:
    reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
        args.dataset_size,
        k=args.k,
        n_authors=args.n_authors,
        features=args.precomputed_features
    )
else:
    reutersc50_dataset, reuters_loader_train, reuters_loader_test, reuters_loader_dev = dataset.load_dataset(
        args.dataset_size,
        k=args.k,
        n_authors=args.n_authors,
        features=args.precomputed_features
    )
# end if

# Disable CUDA
if not args.cuda:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# end if