Example #1
import argparse

# Dataset, OneHot, and run_the_net are assumed to come from the surrounding
# attalos code base (see Example #4's imports).
def main():
    parser = argparse.ArgumentParser(description="")

    # Required arguments
    parser.add_argument(
        "image_feature_file_train",
        type=str,
        help="Image Feature file for the training set",
    )
    parser.add_argument(
        "text_feature_file_train",
        type=str,
        help="Text Feature file for the training set",
    )
    parser.add_argument(
        "image_feature_file_test",
        type=str,
        help="Image Feature file for the test set",
    )
    parser.add_argument(
        "text_feature_file_test",
        type=str,
        help="Text Feature file for the test set",
    )

    # Optional arguments
    parser.add_argument(
        "--output_directory",
        "-d",
        dest="output_directory",
        type=str,
        help="Directory to save the checkpoint files in",
        default=None,
    )
    parser.add_argument(
        "--output_prefix",
        "-p",
        dest="output_prefix",
        type=str,
        help="Text to prefix the checkpoint files with",
        default="single_layer_perceptron_with_negative_sampling",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        dest="batch_size",
        type=int,
        help="Batch size [default 128]",
        default=128,
    )
    parser.add_argument(
        "--hidden_layer_size",
        "-s",
        dest="hidden_layer_size",
        type=int,
        help="Size of the hidden layer [default 100]",
        default=100,
    )
    parser.add_argument(
        "--learning_rate",
        "-l",
        dest="learning_rate",
        type=float,
        help="Learning rate [default 0.001]",
        default=0.001,
    )
    parser.add_argument(
        "--negative_sample_count",
        "-n",
        dest="negative_sample_count",
        type=int,
        help="The number of negative samples to generate per image [default 10]",
        default=10,
    )
    parser.add_argument(
        "--training_epochs",
        "-e",
        dest="training_epochs",
        type=int,
        help="The number of training epochs [default 10]",
        default=10,
    )

    args = parser.parse_args()

    # Read in the data and set up the text encoding
    train_image_feature_file = args.image_feature_file_train
    train_text_feature_file = args.text_feature_file_train
    train_dataset = Dataset(train_image_feature_file, train_text_feature_file)

    test_image_feature_file = args.image_feature_file_test
    test_text_feature_file = args.text_feature_file_test
    test_dataset = Dataset(test_image_feature_file, test_text_feature_file)

    onehot_encoding = OneHot([train_dataset, test_dataset])

    # Train the network
    run_the_net(train_dataset, test_dataset, onehot_encoding,
                args.output_directory, args.output_prefix, args.batch_size,
                args.hidden_layer_size, args.learning_rate,
                args.negative_sample_count, args.training_epochs)
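
Example #1 builds a single OneHot encoding over both the train and test
datasets so the two splits share one tag vocabulary. A minimal sketch of what
such an encoder can look like (the real attalos OneHot API may differ; the
class and attribute names here are illustrative assumptions):

import numpy as np

class OneHotSketch:
    """Hypothetical stand-in for OneHot: maps tags to one-hot vectors."""

    def __init__(self, datasets):
        # Take the union of tags across all datasets so train and test
        # agree on vector positions.
        vocab = set()
        for dataset in datasets:
            for tags in dataset.text_feats.values():
                vocab.update(tags)
        self.tag_to_index = {tag: i for i, tag in enumerate(sorted(vocab))}

    def __call__(self, tag):
        vec = np.zeros(len(self.tag_to_index))
        vec[self.tag_to_index[tag]] = 1.0
        return vec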
Example #2
import argparse
import gzip
import os

import numpy as np

# Dataset and train_model are assumed to come from the surrounding attalos
# code base (see Example #4's imports).
def main():
    parser = argparse.ArgumentParser(description='Two layer linear regression')
    parser.add_argument("image_feature_file_train",
                        type=str,
                        help="Image Feature file for the training set")
    parser.add_argument("text_feature_file_train",
                        type=str,
                        help="Text Feature file for the training set")
    parser.add_argument("image_feature_file_test",
                        type=str,
                        help="Image Feature file for the test set")
    parser.add_argument("text_feature_file_test",
                        type=str,
                        help="Text Feature file for the test set")
    parser.add_argument("word_vector_file",
                        type=str,
                        help="Text file containing the word vectors")

    # Optional Args
    parser.add_argument("--learning_rate",
                        type=float,
                        default=.05,
                        help="Learning Rate")
    parser.add_argument("--epochs",
                        type=int,
                        default=200,
                        help="Number of epochs to run for")
    parser.add_argument("--batch_size",
                        type=int,
                        default=128,
                        help="Batch size to use for training")

    args = parser.parse_args()
    train_dataset = Dataset(args.image_feature_file_train,
                            args.text_feature_file_train)
    test_dataset = Dataset(args.image_feature_file_test,
                           args.text_feature_file_test)

    # Get the full vocab so we can extract only the word vectors we care about
    dataset_tags = set()
    for dataset in [train_dataset, test_dataset]:
        for tags in dataset.text_feats.values():
            dataset_tags.update(tags)

    # Read the word2vec-format vectors, keeping only words that appear as tags
    w2v_lookup = {}
    if not os.path.exists(args.word_vector_file):
        raise IOError('Word vector file not found: %s' % args.word_vector_file)
    if args.word_vector_file.endswith('.gz'):
        # 'rt' so gzip yields text rather than bytes
        input_file = gzip.open(args.word_vector_file, 'rt')
    else:
        input_file = open(args.word_vector_file)
    for line in input_file:
        first_word = line[:line.find(' ')]
        if first_word in dataset_tags:
            line = line.strip().split(' ')
            w2v_vector = np.array([float(j) for j in line[1:]])
            # Normalize the vector before storing so later similarity
            # computations reduce to dot products
            w2v_lookup[line[0]] = w2v_vector / np.linalg.norm(w2v_vector)
    input_file.close()

    train_model(train_dataset,
                test_dataset,
                w2v_lookup,
                batch_size=args.batch_size,
                num_epochs=args.epochs)
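
Note how Example #2 L2-normalizes every vector before storing it: with unit
vectors, cosine similarity between two words reduces to a plain dot product.
A small self-contained illustration (the values are made up):

import numpy as np

a = np.array([3.0, 4.0])
b = np.array([1.0, 1.0])

a_unit = a / np.linalg.norm(a)
b_unit = b / np.linalg.norm(b)

cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# For pre-normalized vectors the dot product is already the cosine similarity.
assert np.isclose(np.dot(a_unit, b_unit), cosine)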
Example #3
import argparse
import logging

logger = logging.getLogger(__name__)

# Dataset, OneHot, NaiveW2V, GloveWrapper, W2VWrapper, get_xy, and train are
# assumed to be defined in the surrounding attalos code base.
def main():
    parser = argparse.ArgumentParser(description='Linear regression')

    # Required args
    parser.add_argument("image_feature_file_train",
                        type=str,
                        help="Image Feature file for the training set")
    parser.add_argument("text_feature_file_train",
                        type=str,
                        help="Text Feature file for the training set")
    parser.add_argument("image_feature_file_test",
                        type=str,
                        help="Image Feature file for the test set")
    parser.add_argument("text_feature_file_test",
                        type=str,
                        help="Text Feature file for the test set")
    parser.add_argument("word_vector_file",
                        type=str,
                        help="Text file containing the word vectors")

    # Optional Args
    parser.add_argument("--word_vector_type",
                        choices=("word2vec", "glove"),
                        default="word2vec",
                        help="Word vector type")
    parser.add_argument("--logging_level",
                        choices=("debug", "info", "warning", "error"),
                        default="warning",
                        help="Python logging level")

    args = parser.parse_args()

    logger.setLevel(getattr(logging, args.logging_level.upper()))

    logger.info("Parsing train and test datasets.")
    train_dataset = Dataset(args.image_feature_file_train,
                            args.text_feature_file_train)
    test_dataset = Dataset(args.image_feature_file_test,
                           args.text_feature_file_test)

    logger.info("Reading word vectors from file.")
    if args.word_vector_type == "glove":
        from glove import Glove
        glove_model = Glove.load_stanford(args.word_vector_file)
        w2v_model = GloveWrapper(glove_model)
    else:  # args.word_vector_type == "word2vec" (default)
        import word2vec
        w2v_model = W2VWrapper(word2vec.load(args.word_vector_file))

    logger.info("Creating one hot tag mapper.")
    one_hot = OneHot([train_dataset, test_dataset],
                     valid_vocab=w2v_model.vocab)

    logger.info("Creating w2v transformer.")
    w2v_transformer = NaiveW2V(one_hot, w2v_model, vocab=one_hot.keys())

    logger.info("Preparing train data from train datasets.")
    train_x, train_y = get_xy(train_dataset, tag_transformer=one_hot)

    logger.info("Transforming y using w2v transformer.")
    transformed_y = w2v_transformer.transform(train_y)
    train_data = (train_x, transformed_y)

    logger.info("Preparing test data from test dataset.")
    test_data = get_xy(test_dataset, tag_transformer=one_hot)

    logger.info("Training model.")
    model = train(train_data, test_data, interpreter=w2v_transformer)
    logger.info("Done.")
Example #4
import sys
sys.path.append('/home/kni/local-kni/attalos')
from attalos.dataset.dataset import Dataset
# from attalos.dataset.transformers.onehot import OneHot
import numpy as np
import json


datadir = '/data/fs4/teams/attalos/features/'
imdata = datadir + 'image/visualgenome_train_20160816_inception.hdf5'
txdata = datadir + 'text/visualgenome_train_20160816_text.json.gz'
dirTr = '/data/fs4/datasets/vg_unzipped/'
splits = json.load(open('densecap_splits.json'))

## Using our dataset iterators, load in the text data
alldata = Dataset(imdata,txdata)
for key in alldata.text_feats:
    words = alldata.text_feats[key]
    # Keep only the last token of each (possibly multi-word) tag, lowercased
    words = [word.split()[-1].lower() for word in words]
    alldata.text_feats[key] = words
del key, words
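
The loop above keeps only the last token of each (possibly multi-word) tag and
lowercases it, for example:

words = ["Red Car", "tree"]
print([word.split()[-1].lower() for word in words])  # ['car', 'tree']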


## Create an unordered vocabulary with counts
#  Full vocabulary is stored in `vocab`, with `counts`
def getvocab(text_feats):
    vocab = dict()
    for key in text_feats:
        words = text_feats[key]
        for word in set(words):