def perplexity(self, ngrams, word_dict, Im=None, context=5):
    """ Compute the perplexity of ngrams from net """
    ll = 0
    N = 0
    x = T.matrix('x', dtype='int32')
    im = T.matrix('im')
    forward_T = theano.function([x, im], self.forward(x, im))
    for i, ng in enumerate(ngrams):
        instances = lm_tools.model_inputs([ng], word_dict)
        if Im is not None:
            # Multimodal case: tile this caption's image feature so that
            # every n-gram instance is paired with the same image row
            ll += self.compute_ll(
                instances.astype(np.int32),
                np.tile(Im[i], (len(ng), 1)).astype(theano.config.floatX),
                forward_T)
        else:
            # Text-only case: no image features are passed
            ll += self.compute_ll(instances)
        N += len(instances)
    return np.power(2, (-1.0 / N) * ll)
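
# Hedged usage sketch: `net` is assumed to be a trained instance of the class
# above, with `test_ngrams` grouped one list per caption and `testIM` holding
# one image-feature row per caption. The return value above is
# 2^(-(1/N) * ll), which assumes compute_ll returns a base-2 log-likelihood.
# None of the names below are defined in this file.
def example_perplexity(net, test_ngrams, word_dict, testIM):
    """ Evaluate a trained net on held-out n-grams (illustrative only) """
    ppl = net.perplexity(test_ngrams, word_dict, Im=testIM, context=5)
    print 'Test perplexity: %.2f' % ppl
    return ppl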
def process(context=5):
    """ Main process function """
    # Load images
    print 'Loading images...'
    (trainIM, devIM, testIM) = load_features_npy()

    # Load sentences
    print 'Loading sentences...'
    d = load_sentences()

    # Load image ids
    print 'Loading image ids...'
    (dx_train, dx_dev) = image_ids()

    # Load splits
    print 'Loading splits...'
    (train_sp, dev_sp, test_sp) = load_splits()

    # Load captions
    print 'Loading captions...'
    train = construct_captions(d, train_sp)
    dev = construct_captions(d, dev_sp)
    test = construct_captions(d, test_sp)

    # Tokenize
    (train_tokens, topwords) = tokenize(train, context=context)
    dev_tokens = tokenize(dev, context=context, topwords=topwords)[0]
    test_tokens = tokenize(test, context=context, topwords=topwords)[0]

    # Index words and create vocabulary
    print 'Creating vocabulary...'
    (word_dict, index_dict) = index_words(train_tokens + dev_tokens)

    # Compute n-grams
    print 'Computing n-grams...'
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)
    dev_ngrams = lm_tools.get_ngrams(dev_tokens, context=context)
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print 'Computing labels...'
    train_labels = compute_labels(train_ngrams, word_dict, context=context)
    dev_labels = compute_labels(dev_ngrams, word_dict, context=context)

    # Compute model instances
    print 'Computing model instances...'
    (train_instances, train_index) = lm_tools.model_inputs(
        train_ngrams, word_dict, context=context,
        include_last=False, include_index=True)
    (dev_instances, dev_index) = lm_tools.model_inputs(
        dev_ngrams, word_dict, context=context,
        include_last=False, include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(
        test_ngrams, word_dict, context=context,
        include_last=False, include_index=True)

    # Save everything into dictionaries
    print 'Packing up...'
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = train_labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zd = {}
    zd['text'] = dev
    zd['tokens'] = dev_tokens
    zd['ngrams'] = dev_ngrams
    zd['labels'] = dev_labels
    zd['instances'] = dev_instances
    zd['IM'] = devIM
    zd['index'] = dev_index
    zd['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zd, zt)
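
# Hedged usage sketch: run the full pipeline and pickle the train/dev/test
# dictionaries for later use by the training code. The output filename is an
# assumption, not part of the original pipeline.
def example_save_splits():
    """ Build and cache the processed splits (illustrative only) """
    import cPickle as pickle
    (z, zd, zt) = process(context=5)
    with open('processed_splits.pkl', 'wb') as f:
        pickle.dump((z, zd, zt), f, protocol=pickle.HIGHEST_PROTOCOL)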
def process():
    """ Specify the data paths and context size below """
    ##################################
    train_captions = os.getcwd() + '/mnlm/engine/iaprtc12/train_captions.txt'
    train_images = os.getcwd() + '/mnlm/engine/iaprtc12/train_hidden7.txt'
    test_captions = os.getcwd() + '/mnlm/engine/iaprtc12/test_captions.txt'
    test_images = os.getcwd() + '/mnlm/engine/iaprtc12/test_hidden7.txt'
    context = 5
    ##################################

    # Load captions
    print 'Loading captions...'
    train = load_captions(train_captions)
    test = load_captions(test_captions)

    # Tokenize the data
    print 'Tokenizing...'
    train_tokens = tokenize(train, context=context)
    test_tokens = tokenize(test, context=context)

    # Index words and create vocabulary
    print 'Creating vocabulary...'
    # Build a mapping between words and integer indices
    (word_dict, index_dict) = index_words(train_tokens)

    # Compute n-grams
    print 'Computing n-grams...'
    # All tuples of length context+1, grouped by caption
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print 'Computing labels...'
    labels = compute_labels(train_ngrams, word_dict, context=context)

    # Compute model instances
    print 'Computing model instances...'
    (train_instances, train_index) = lm_tools.model_inputs(
        train_ngrams, word_dict, context=context,
        include_last=False, include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(
        test_ngrams, word_dict, context=context,
        include_last=False, include_index=True)

    # Load image features
    print 'Loading image features...'
    trainIM = load_convfeatures(train_images)
    testIM = load_convfeatures(test_images)

    # Save everything into dictionaries
    print 'Packing up...'
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zt)
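
# Hedged sketch of the vocabulary mapping built by index_words above:
# word_dict is assumed to map word -> integer id and index_dict to be its
# inverse (id -> word), so a row of model instances can be decoded back to
# tokens. This helper is illustrative and not part of the original pipeline.
def decode_instance(instance, index_dict):
    """ Map a row of word indices back to tokens (illustrative only) """
    return [index_dict[i] for i in instance]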
def process():
    """ Specify the data paths and context size below """
    ##################################
    train_captions = 'iaprtc12/train_captions.txt'
    train_images = 'iaprtc12/train_hidden7.txt'
    test_captions = 'iaprtc12/test_captions.txt'
    test_images = 'iaprtc12/test_hidden7.txt'
    context = 5
    ##################################

    # Load captions
    print 'Loading captions...'
    train = load_captions(train_captions)
    test = load_captions(test_captions)

    # Tokenize the data
    print 'Tokenizing...'
    train_tokens = tokenize(train, context=context)
    test_tokens = tokenize(test, context=context)

    # Index words and create vocabulary
    print 'Creating vocabulary...'
    (word_dict, index_dict) = index_words(train_tokens)

    # Compute n-grams
    print 'Computing n-grams...'
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print 'Computing labels...'
    labels = compute_labels(train_ngrams, word_dict, context=context)

    # Compute model instances
    print 'Computing model instances...'
    (train_instances, train_index) = lm_tools.model_inputs(
        train_ngrams, word_dict, context=context,
        include_last=False, include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(
        test_ngrams, word_dict, context=context,
        include_last=False, include_index=True)

    # Load image features
    print 'Loading image features...'
    trainIM = load_convfeatures(train_images)
    testIM = load_convfeatures(test_images)

    # Save everything into dictionaries
    print 'Packing up...'
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zt)
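
# Hedged usage sketch: the returned dictionaries mirror the packing step
# above, so basic corpus statistics can be read straight off their entries.
def example_inspect():
    """ Inspect the processed IAPR TC-12 data (illustrative only) """
    (z, zt) = process()
    print 'Vocabulary size: %d' % len(z['word_dict'])
    print 'Training captions: %d' % len(z['text'])
    print 'Test captions: %d' % len(zt['text'])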