raise Exception(
                "To train probing model, either pass load-serialization-dir "
                "or pass layer-num and base-model-dir")

    # Set numpy, tensorflow and python seeds for reproducibility.
    tf.random.set_seed(1337)
    np.random.seed(1337)
    random.seed(13370)

    # Set some constants
    MAX_NUM_TOKENS = 250
    VOCAB_SIZE = 10000
    GLOVE_COMMON_WORDS_PATH = os.path.join("data", "glove_common_words.txt")

    print("Reading training instances.")
    train_instances = read_instances(args.train_data_file_path, MAX_NUM_TOKENS)
    print("Reading validation instances.")
    validation_instances = read_instances(args.validation_data_file_path,
                                          MAX_NUM_TOKENS)

    if args.load_serialization_dir:
        print(f"Ignoring the model arguments and loading the "
              f"model from serialization_dir: {args.load_serialization_dir}")

        # Load Vocab
        vocab_path = os.path.join(args.load_serialization_dir, "vocab.txt")
        vocab_token_to_id, vocab_id_to_token = load_vocabulary(vocab_path)

        # Load Model
        classifier = load_pretrained_model(args.load_serialization_dir)
    else:

Example #2
    parser.add_argument('--epochs', type=int, help="num epochs", default=10)
    parser.add_argument('--embed-file', type=str, help="embedding location", default='./data/glove.6B.100D.txt')
    parser.add_argument('--embed-dim', type=int, help="size of embeddings", default=100)

    args = parser.parse_args()

    tf.random.set_seed(1337)
    np.random.seed(1337)
    random.seed(13370)

    MAX_TOKENS = 250
    VOCAB_SIZE = 10000
    GLOVE_COMMON_WORDS_PATH = os.path.join("data", "glove_common_words.txt")

    print(f"\nReading Train Instances")
    train_instances = read_instances(args.data_file, MAX_TOKENS)
    print(f"\nReading Val Instances")
    val_instances = read_instances(args.val_file, MAX_TOKENS)

    with open(GLOVE_COMMON_WORDS_PATH, encoding='UTF-8') as file:
        glove_common_words = [line.strip() for line in file.readlines() if line.strip()]

    vocab_token_to_id, vocab_id_to_token = build_vocabulary(train_instances, VOCAB_SIZE,
                                                            glove_common_words)
    # Vocabulary size is simply the number of entries in the token-to-id map.
    vocab_size = len(vocab_token_to_id)

    train_instances = index_instances(train_instances, vocab_token_to_id)
    val_instances = index_instances(val_instances, vocab_token_to_id)

    ### TODO(Students) START
    # make a config file here as expected by your MyAdvancedModel
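    # A minimal sketch of such a config (assumption: the exact keys MyAdvancedModel
    # expects are not shown in this snippet; vocab_size and embed_dim follow the
    # pattern the other examples in this file use).
    config = {
        "vocab_size": vocab_size,
        "embed_dim": args.embed_dim
    }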
Example #3
    parser.add_argument('--data-file-path',
                        type=str,
                        help='Location of test file',
                        default='data/test.txt')
    parser.add_argument('--prediction-file',
                        type=str,
                        help="Location of output file")
    parser.add_argument('--batch-size',
                        type=int,
                        help="size of batch",
                        default=32)

    args = parser.parse_args()

    MAX_NUM_TOKENS = 250
    test_instances = read_instances(args.data_file_path,
                                    MAX_NUM_TOKENS,
                                    test=True)

    vocabulary_path = os.path.join(args.load_serialization_dir, "vocab.txt")
    vocab_token_to_id, _ = load_vocabulary(vocabulary_path)

    test_instances = index_instances(test_instances, vocab_token_to_id)

    # load config
    config_path = os.path.join(args.load_serialization_dir, "config.json")
    with open(config_path, 'r') as f:
        config = json.load(f)

    # load model
    model = load_pretrained_model(args.load_serialization_dir)
Example #4
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Predict with trained Main/Probing Model')

    parser.add_argument('load_serialization_dir', type=str,
                        help='serialization directory from which to load the trained model.')
    parser.add_argument('data_file_path', type=str, help='data file path to predict on.')
    parser.add_argument('--predictions-file', type=str, help='output predictions file.')
    parser.add_argument('--batch-size', type=int, default=32, help='batch size')

    args = parser.parse_args()

    # Set some constants
    MAX_NUM_TOKENS = 250

    instances = read_instances(args.data_file_path, MAX_NUM_TOKENS)

    # Load Model
    classifier = load_pretrained_model(args.load_serialization_dir)

    # Load Config
    config_path = os.path.join(args.load_serialization_dir, "config.json")
    with open(config_path, "r") as file:
        config = json.load(file)
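    # The saved config carries a "type" key (e.g. "CNN", "CNN_BiGRU", or a BERT
    # variant); non-BERT models also need the word-level vocabulary for indexing.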

    is_bert = False
    if config['type'] == 'CNN' or config['type'] == 'CNN_BiGRU':
        vocabulary_path = os.path.join(args.load_serialization_dir, "vocab.txt")
        vocab_token_to_id, _ = load_vocabulary(vocabulary_path)
        instances = index_instances(instances, vocab_token_to_id)
    else:

Example #5
    parser.add_argument('--num-epochs', type=int, default=4, help='max num epochs to train for')
    parser.add_argument('--pretrained-bert-model', type=str, default='bert-base-uncased',
                        help='name of the pretrained BERT model to load via the huggingface transformers library')
    parser.add_argument('--model-choice', type=str, choices=("bert_cnn", "bert"), help='Choice of model')

    parser.add_argument('--experiment-name', type=str, default="only_bert",
                        help='optional experiment name which determines where to store the model training outputs.')

    parser.add_argument('--num-tokens', type=int, help='maximum number of tokens per instance', default=16)
    parser.add_argument('--nn-hidden-dim', type=int, help='hidden_dim of fully connected neural network', default=100)
    parser.add_argument('--dropout-prob', type=float, help="dropout rate", default=0.2)

    args = parser.parse_args()

    print("Reading training instances.")
    train_instances = read_instances(args.train_data_file_path, args.num_tokens)
    print("Reading validation instances.")
    validation_instances = read_instances(args.validation_data_file_path, args.num_tokens)

    # index tokens based on bert model vocab using huggingface transformers library
    train_instances = bert_index_instances(train_instances)
    validation_instances = bert_index_instances(validation_instances)
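    # bert_index_instances is defined elsewhere; roughly, it maps each instance's
    # text to wordpiece ids with the huggingface tokenizer. A sketch of the
    # underlying call (not the actual helper):
    #     from transformers import BertTokenizer
    #     tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model)
    #     token_ids = tokenizer.encode(text, add_special_tokens=True)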
    config = {
        "num_tokens": args.num_tokens,
        "nn_hidden_dim": args.nn_hidden_dim,
        "dropout_prob": args.dropout_prob}

    # based on model choice, build config and instantiate model
    if args.model_choice == "bert":
        model = create_vanilla_bert_model(**config)
        config["type"] = "BERT"
Example #6
    main_model_subparser.add_argument('--hidden-units-layer3',
                                      type=int,
                                      help='number of hidden units in layer 3')
    """
    main_model_subparser.add_argument('--pretrained-embedding-file', type=str,
                                      help='if passed, use glove embeddings to initialize '
                                           'the embedding matrix')
    """
    tf.random.set_seed(1337)
    np.random.seed(1337)
    random.seed(13370)
    args = parser.parse_args()

    print("Reading Training Instances")
    train_instances, max_length_train, number_of_columns_train = read_instances(
        args.train_data_file_path, 'train')

    print("Reading validation instances.")
    validation_instances, max_length_validation, number_of_columns_validation = read_instances(
        args.validation_data_file_path, 'validation')

    if args.load_serialization_dir:
        print(f"Ignoring the model arguments and loading the "
              f"model from serialization_dir: {args.load_serialization_dir}")

        model = load_pretrained_model(args.load_serialization_dir)
    else:

        if args.model_name == "main":
            config = {
                "encoder_decoder_choice":