Example #1
    # Add data args.
    parser.add_argument('--scorer', type=str, default='scripts/evaluation/scorer_conll2005.pl')
    parser.add_argument('--processor', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--input', type=str, required=True)
    parser.add_argument('--output_dir', type=str, required=True)

    # Add dataloader args.
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_workers', type=int, default=16)

    # Parse the command-line arguments.
    args = parser.parse_args()

    # Build the processor from the saved configuration, enabling Viterbi decoding.
    processor = Processor.from_config(args.processor, viterbi_decoding=True)

    # Load the test dataset.
    test_dataset = CoNLL(args.input)

    # Create the dataloader for the test set.
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        collate_fn=processor.collate_sentences)

    # Restore the trained model from the checkpoint and switch to evaluation mode.
    model = MultilingualSrlModel.load_from_checkpoint(args.model)
    model.eval()

    # Run the evaluation on a GPU if one is available.
    trainer = Trainer(gpus=1 if torch.cuda.is_available() else 0)
    trainer.test(model=model, test_dataloaders=test_dataloader)
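
The fragment above assumes that the argument parser and the project imports have already been set up. A minimal scaffold could look like the sketch below; the module paths for Processor, CoNLL and MultilingualSrlModel are assumptions and should be adjusted to the actual project layout.

import argparse

import torch
from torch.utils.data import DataLoader
from pytorch_lightning import Trainer

# Placeholder module paths; adjust to the actual package layout.
from srl.processor import Processor
from srl.dataset import CoNLL
from srl.model import MultilingualSrlModel

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # ... the argument definitions and the evaluation code from Example #1 go here.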
Example #2
def load_train_test_validate_dataset(
        hyperparameters: Dict[str, Any], input_data_dir: str,
        reproducibility_saver: ReproducibilitySaver) -> Dict[str, Any]:
    preprocessor_hyperparameters = hyperparameters['preprocessor_config']

    vocabulary = None
    returned_dict = {}

    if reproducibility_saver.trained_model_dir:
        vocabulary = reproducibility_saver.restore_vocabulary()

    # TODO make it save the tensorised value
    if reproducibility_saver.restore_data:
        # Only the testing file list needs to be restored here.
        restored_dirs = reproducibility_saver.restore_preprocessed_dirs(
            restore_validating_file_list=False,
            restore_training_file_list=False)
        test_data_files = restored_dirs['testing_data_files']
        testing_dataset_preprocessor = Processor(
            config=preprocessor_hyperparameters,
            data_files=test_data_files,
            vocabulary=vocabulary)
        returned_dict[
            'testing_dataset_preprocessor'] = testing_dataset_preprocessor

    else:
        print("Manually loading files from input_data_dir")
        all_files = get_data_files_from_directory(
            input_data_dir,
            skip_tests=preprocessor_hyperparameters['skip_tests'])
        print("Total # files: {}".format(len(all_files)))
        train_data_files, test_data_files = train_test_split(all_files,
                                                             train_size=0.7,
                                                             test_size=0.3)
        train_data_files, validate_data_files = train_test_split(
            train_data_files, train_size=0.9, test_size=0.1)
        print(
            "Training Data: {}, Testing Data: {}, Validating data: {}".format(
                len(train_data_files), len(test_data_files),
                len(validate_data_files)))

        training_dataset_preprocessor = Processor(
            config=preprocessor_hyperparameters,
            data_files=train_data_files,
            vocabulary=vocabulary)
        vocabulary = training_dataset_preprocessor.vocabulary
        validating_dataset_preprocessor = Processor(
            config=preprocessor_hyperparameters,
            data_files=validate_data_files,
            vocabulary=vocabulary)
        testing_dataset_preprocessor = Processor(
            config=preprocessor_hyperparameters,
            data_files=test_data_files,
            vocabulary=vocabulary)
        returned_dict[
            'training_dataset_preprocessor'] = training_dataset_preprocessor
        returned_dict[
            'validating_dataset_preprocessor'] = validating_dataset_preprocessor
        returned_dict[
            'testing_dataset_preprocessor'] = testing_dataset_preprocessor

    returned_dict['vocabulary'] = vocabulary
    return returned_dict
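
A hypothetical call site for this function is sketched below. The ReproducibilitySaver constructor arguments and the max_chunk_length value are assumptions; the preprocessor keys are taken from Example #5.

hyperparameters = {
    'preprocessor_config': {
        'vocabulary_count_threshold': 3,
        'min_line_of_codes': 3,
        'skip_tests': True,
        'max_chunk_length': 8,  # assumed value
    }
}
# The constructor arguments below are assumptions.
reproducibility_saver = ReproducibilitySaver(trained_model_dir=None, restore_data=False)

datasets = load_train_test_validate_dataset(
    hyperparameters,
    input_data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin',
    reproducibility_saver=reproducibility_saver)

training_preprocessor = datasets['training_dataset_preprocessor']
vocabulary = datasets['vocabulary']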
Example #3
    # Store the arguments in hparams.
    hparams = parser.parse_args()

    # Initialize random generators with the given seed.
    seed_everything(hparams.seed)

    # Load the train dataset.
    train_dataset = CoNLL(hparams.train_path)

    # Load the validation dataset.
    dev_dataset = CoNLL(hparams.dev_path)

    # The processor builds the input/output maps, encodes sentences, and decodes the model output.
    processor = Processor(
        train_dataset,
        input_representation=hparams.input_representation,
        model_name=hparams.language_model)

    # Create the dataloader for the training set.
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=hparams.batch_size,
        shuffle=hparams.shuffle,
        num_workers=hparams.num_workers,
        collate_fn=processor.collate_sentences)

    # Create the dataloader for the validation set.
    dev_dataloader = DataLoader(
        dev_dataset,
        batch_size=hparams.batch_size,
        num_workers=hparams.num_workers,
        collate_fn=processor.collate_sentences)
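
Example #3 stops after the dataloaders are built; a plausible continuation would construct the model and start training, as sketched below. The MultilingualSrlModel constructor arguments are assumptions, since Example #1 only shows the model being restored from a checkpoint.

    # Build the model; the constructor arguments here are assumptions.
    model = MultilingualSrlModel(hparams=hparams, processor=processor)

    # Train on a GPU if one is available (old-style Trainer arguments, as in Example #1).
    trainer = Trainer(gpus=1 if torch.cuda.is_available() else 0)
    trainer.fit(model, train_dataloader, dev_dataloader)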
Example #4
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from data.processor import Processor
from models.cnn_attention import ConvAttention

tf.enable_eager_execution()

data = Processor(config=Processor.DEFAULT_CONFIG,
                 data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin/cluster/allocation/')

vocab = data.metadata['token_vocab']
processed = data.get_tensorise_data()

vocabulary_size = len(vocab) + 1
max_chunk_length = data.config['max_chunk_length']
code_snippet = np.expand_dims(processed['body_tokens'], -1)
label_name = np.expand_dims(processed['name_tokens'], axis=-1)

print("Vocab Size: {} number of Code snippet: {} number of labels: {}".format(vocabulary_size, len(code_snippet),
                                                                              len(label_name)))
print("Label_name shape: {}\nCode_snippet shape: {}".format(label_name.shape, code_snippet.shape))

# TODO make the input a json file and parse it
hyperparameter = {'batch_size': 1, 'k1': 8, 'k2': 8, 'w1': 24, 'w2': 29, 'w3': 10, 'dropout_rate': 0.5,
                  'max_chunk_length': max_chunk_length, 'vocabulary_size': vocabulary_size, 'embedding_dim': 128}
# Optimised hyperparameters are reported on page 5 of the paper

batch_size = hyperparameter['batch_size']
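
One way to feed these arrays to a Keras model is through a tf.data pipeline. The batching step below is an illustration rather than part of the original script.

# Illustrative input pipeline (not part of the original script).
dataset = tf.data.Dataset.from_tensor_slices((code_snippet, label_name))
dataset = dataset.shuffle(buffer_size=len(code_snippet)).batch(batch_size)

# With eager execution enabled, the batches can be iterated directly.
for snippet_batch, name_batch in dataset.take(1):
    print(snippet_batch.shape, name_batch.shape)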
Example #5
        "vocabulary_count_threshold": 3,
        "min_line_of_codes": 3,
        "skip_tests": True
    }
}

all_files = get_data_files_from_directory(
    data_dir='data/raw/r252-corpus-features/org/elasticsearch/action/admin',
    skip_tests=hyperparameters['preprocessor_config']['skip_tests'])
print("Total # files: {}".format(len(all_files)))
train_data_files, test_data_files = train_test_split(all_files, train_size=0.7)
train_data_files, validate_data_files = train_test_split(train_data_files,
                                                         train_size=0.9)
print("Training Data: {}, Testing Data: {}, Validating data: {}".format(
    len(train_data_files), len(test_data_files), len(validate_data_files)))
training_dataset_preprocessor = Processor(
    config=hyperparameters['preprocessor_config'], data_files=train_data_files)
validating_dataset_preprocessor = Processor(
    config=hyperparameters['preprocessor_config'],
    data_files=validate_data_files,
    vocabulary=training_dataset_preprocessor.vocabulary)
testing_dataset_preprocessor = Processor(
    config=hyperparameters['preprocessor_config'],
    data_files=test_data_files,
    vocabulary=training_dataset_preprocessor.vocabulary)


vocab = training_dataset_preprocessor.vocabulary
vocabulary_size = len(vocab) + 1
max_chunk_length = training_dataset_preprocessor.config['max_chunk_length']
training_data_tensors = training_dataset_preprocessor.get_tensorise_data()
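
Following Example #4, the tensorised training data can then be reshaped before being fed to a model. The 'body_tokens' and 'name_tokens' keys below are taken from that example and assumed to apply here as well.

import numpy as np

# Reshape the tensorised data as in Example #4 (dictionary keys assumed to match).
code_snippet = np.expand_dims(training_data_tensors['body_tokens'], axis=-1)
label_name = np.expand_dims(training_data_tensors['name_tokens'], axis=-1)
print("Code snippets: {}, labels: {}".format(len(code_snippet), len(label_name)))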