def main(): parser = argparse.ArgumentParser(description="") # Required arguments parser.add_argument( "image_feature_file_train", type=str, help="Image Feature file for the training set", ) parser.add_argument( "text_feature_file_train", type=str, help="Text Feature file for the training set", ) parser.add_argument( "image_feature_file_test", type=str, help="Image Feature file for the test set", ) parser.add_argument( "text_feature_file_test", type=str, help="Text Feature file for the test set", ) # Optional arguments parser.add_argument( "--output_directory", "-d", dest="output_directory", type=str, help="Directory to save the checkpoint files in", default=None, ) parser.add_argument( "--output_prefix", "-p", dest="output_prefix", type=str, help="Text to prefix the checkpoint files with", default="single_layer_perceptron_with_negative_sampling", ) parser.add_argument( "--batch_size", "-b", dest="batch_size", type=int, help="Batch size [default 128]", default=128, ) parser.add_argument( "--hidden_layer_size", "-s", dest="hidden_layer_size", type=int, help="Size of the hidden layer [default 100]", default=100, ) parser.add_argument( "--learning_rate", "-l", dest="learning_rate", type=float, help="Learning rate [default 0.001]", default=0.001, ) parser.add_argument( "--negative_sample_count", "-n", dest="negative_sample_count", type=int, help= "The number of negative samples to generate per image [default 10]", default=10, ) parser.add_argument( "--training_epochs", "-e", dest="training_epochs", type=int, help="The number of training epochs [default 10]", default=10, ) args = parser.parse_args() # Read in the data and set up the text encoding train_image_feature_file = args.image_feature_file_train train_text_feature_file = args.text_feature_file_train train_dataset = Dataset(train_image_feature_file, train_text_feature_file) test_image_feature_file = args.image_feature_file_test test_text_feature_file = args.text_feature_file_test test_dataset = Dataset(test_image_feature_file, test_text_feature_file) onehot_encoding = OneHot([train_dataset, test_dataset]) # Train the network run_the_net(train_dataset, test_dataset, onehot_encoding, args.output_directory, args.output_prefix, args.batch_size, args.hidden_layer_size, args.learning_rate, args.negative_sample_count, args.training_epochs)
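# The script above only passes `negative_sample_count` through to
# `run_the_net`, whose body is not shown in this section. The snippet below is
# a hedged sketch of the negative-sampling idea that option refers to -- it is
# NOT the actual run_the_net implementation, and `sample_negative_tags` and its
# arguments are hypothetical names introduced purely for illustration.
import numpy as np


def sample_negative_tags(positive_indices, vocab_size, negative_sample_count):
    """Draw tag indices uniformly at random, excluding an image's positive tags."""
    # Candidate pool: every vocabulary index that is not a positive tag for this image.
    candidates = np.setdiff1d(np.arange(vocab_size), np.asarray(positive_indices))
    return np.random.choice(candidates, size=negative_sample_count, replace=False)


# Example: 10 negatives for an image tagged with vocabulary entries 3 and 17,
# mirroring the --negative_sample_count default above.
# negatives = sample_negative_tags([3, 17], vocab_size=1000, negative_sample_count=10)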
def main():
    import os
    parser = argparse.ArgumentParser(description='Two layer linear regression')
    parser.add_argument("image_feature_file_train",
                        type=str,
                        help="Image Feature file for the training set")
    parser.add_argument("text_feature_file_train",
                        type=str,
                        help="Text Feature file for the training set")
    parser.add_argument("image_feature_file_test",
                        type=str,
                        help="Image Feature file for the test set")
    parser.add_argument("text_feature_file_test",
                        type=str,
                        help="Text Feature file for the test set")
    parser.add_argument("word_vector_file",
                        type=str,
                        help="Text file containing the word vectors")
    # Optional Args
    parser.add_argument("--learning_rate",
                        type=float,
                        default=.05,
                        help="Learning Rate")
    parser.add_argument("--epochs",
                        type=int,
                        default=200,
                        help="Number of epochs to run for")
    parser.add_argument("--batch_size",
                        type=int,
                        default=128,
                        help="Batch size to use for training")

    args = parser.parse_args()

    train_dataset = Dataset(args.image_feature_file_train,
                            args.text_feature_file_train)
    test_dataset = Dataset(args.image_feature_file_test,
                           args.text_feature_file_test)

    # Get the full vocab so we can extract only the word vectors we care about
    dataset_tags = set()
    for dataset in [train_dataset, test_dataset]:
        for tags in dataset.text_feats.values():
            dataset_tags.update(tags)

    # Read w2vec
    w2v_lookup = {}
    if os.path.exists(args.word_vector_file):
        if args.word_vector_file.endswith('.gz'):
            input_file = gzip.open(args.word_vector_file)
        else:
            input_file = open(args.word_vector_file)

        for i, line in enumerate(input_file):
            first_word = line[:line.find(' ')]
            if first_word in dataset_tags:
                line = line.strip().split(' ')
                w2v_vector = np.array([float(j) for j in line[1:]])
                # Normalize vector before storing
                w2v_lookup[line[0]] = w2v_vector / np.linalg.norm(w2v_vector)

    train_model(train_dataset,
                test_dataset,
                w2v_lookup,
                batch_size=args.batch_size,
                num_epochs=args.epochs)
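# The loop above unit-normalizes each word vector before storing it in
# w2v_lookup. As a hedged aside on why that is convenient: once vectors have
# unit norm, the cosine similarity between two tags reduces to a plain dot
# product. The lookup contents below are made-up toy values, not entries from
# any real word-vector file.
import numpy as np

toy_lookup = {
    'dog': np.array([0.9, 0.1, 0.2]),
    'cat': np.array([0.8, 0.3, 0.1]),
}
# Same normalization step as in main() above.
toy_lookup = {tag: vec / np.linalg.norm(vec) for tag, vec in toy_lookup.items()}

# With unit-norm vectors, cosine similarity is just the dot product.
cosine_dog_cat = np.dot(toy_lookup['dog'], toy_lookup['cat'])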
def main():
    parser = argparse.ArgumentParser(description='Linear regression')
    # Required args
    parser.add_argument("image_feature_file_train",
                        type=str,
                        help="Image Feature file for the training set")
    parser.add_argument("text_feature_file_train",
                        type=str,
                        help="Text Feature file for the training set")
    parser.add_argument("image_feature_file_test",
                        type=str,
                        help="Image Feature file for the test set")
    parser.add_argument("text_feature_file_test",
                        type=str,
                        help="Text Feature file for the test set")
    parser.add_argument("word_vector_file",
                        type=str,
                        help="Text file containing the word vectors")
    # Optional Args
    parser.add_argument("--word_vector_type",
                        choices=("word2vec", "glove"),
                        default="word2vec",
                        help="Word vector type")
    parser.add_argument("--logging_level",
                        choices=("debug", "info", "warning", "error"),
                        default="warning",
                        help="Python logging level")

    args = parser.parse_args()
    logger.setLevel(getattr(logging, args.logging_level.upper()))

    logger.info("Parsing train and test datasets.")
    train_dataset = Dataset(args.image_feature_file_train,
                            args.text_feature_file_train)
    test_dataset = Dataset(args.image_feature_file_test,
                           args.text_feature_file_test)

    logger.info("Reading word vectors from file.")
    if args.word_vector_type == "glove":
        from glove import Glove
        glove_model = Glove.load_stanford(args.word_vector_file)
        w2v_model = GloveWrapper(glove_model)
    else:  # args.word_vector_type == "word2vec" (default)
        import word2vec
        w2v_model = W2VWrapper(word2vec.load(args.word_vector_file))

    logger.info("Creating one hot tag mapper.")
    one_hot = OneHot([train_dataset, test_dataset], valid_vocab=w2v_model.vocab)

    logger.info("Creating w2v transformer.")
    w2v_transformer = NaiveW2V(one_hot, w2v_model, vocab=one_hot.keys())

    logger.info("Preparing train data from train datasets.")
    train_x, train_y = get_xy(train_dataset, tag_transformer=one_hot)

    logger.info("Transforming y using w2v transformer.")
    transformed_y = w2v_transformer.transform(train_y)
    train_data = (train_x, transformed_y)

    logger.info("Preparing test data from test dataset.")
    test_data = get_xy(test_dataset, tag_transformer=one_hot)

    logger.info("Training model.")
    model = train(train_data, test_data, interpreter=w2v_transformer)

    logger.info("Done.")
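# NaiveW2V's internals are not shown in this section (it comes from the
# attalos code base). The sketch below is only a conceptual stand-in for what
# "transforming y using a w2v transformer" can mean -- mapping a multi-hot tag
# matrix into word-vector space by averaging the embeddings of an example's
# tags. It is NOT the actual NaiveW2V.transform, and `multihot_to_wordvec` is a
# hypothetical name introduced here for illustration.
import numpy as np


def multihot_to_wordvec(y_multihot, embedding_matrix):
    """Map multi-hot tag rows into word-vector space by averaging tag embeddings.

    y_multihot:       (num_examples, vocab_size) 0/1 matrix, as produced by a
                      one-hot/multi-hot tag mapper.
    embedding_matrix: (vocab_size, embedding_dim) word vectors, one row per tag.
    """
    summed = y_multihot.dot(embedding_matrix)
    counts = np.maximum(y_multihot.sum(axis=1, keepdims=True), 1)  # avoid divide-by-zero
    return summed / counts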
import sys
sys.path.append('/home/kni/local-kni/attalos')

from attalos.dataset.dataset import Dataset
# from attalos.dataset.transformers.onehot import OneHot
import numpy as np
import json

datadir = '/data/fs4/teams/attalos/features/'
imdata = datadir + 'image/visualgenome_train_20160816_inception.hdf5'
txdata = datadir + 'text/visualgenome_train_20160816_text.json.gz'
dirTr = '/data/fs4/datasets/vg_unzipped/'
splits = json.load(open('densecap_splits.json'))

## Using our dataset iterators, load in the text data
alldata = Dataset(imdata, txdata)
for key in alldata.text_feats:
    words = alldata.text_feats[key]
    words = [word.split()[-1].lower() for word in words]
    alldata.text_feats[key] = words
del key, words, word

## Create an unordered vocabulary with counts
# Full vocabulary is stored in `vocab`, with `counts`
def getvocab(text_feats):
    vocab = dict()
    for key in text_feats:
        words = text_feats[key]
        for word in set(words):