Code example #1
File: mlbl_word2vec.py Project: caomw/im2text
def perplexity(self, ngrams, word_dict, Im=None, context=5):
    """
    Compute the perplexity of the model on a collection of n-grams
    """
    ll = 0  # accumulated base-2 log-likelihood
    N = 0   # number of scored instances
    x = T.matrix('x', dtype='int32')
    im = T.matrix('im')
    # Compile the forward pass: word indices (and image features) -> predictions
    forward_T = theano.function([x, im], self.forward(x, im))
    for i, ng in enumerate(ngrams):
        instances = lm_tools.model_inputs([ng], word_dict)
        if Im is not None:
            # Tile the image features so each n-gram instance gets a copy
            ll += self.compute_ll(instances.astype(np.int32),
                                  np.tile(Im[i], (len(ng), 1)).astype(theano.config.floatX),
                                  forward_T)
        else:
            ll += self.compute_ll(instances)
        N += len(instances)
    return np.power(2, (-1.0 / N) * ll)
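
The return line implements the textbook definition of perplexity: 2 raised to the negative mean base-2 log-likelihood. A minimal standalone sketch of the same arithmetic, with hypothetical per-word probabilities standing in for the model's compute_ll:

import numpy as np

# Hypothetical probabilities a model assigns to four predicted words
probs = np.array([0.25, 0.10, 0.50, 0.05])

ll = np.sum(np.log2(probs))                # total base-2 log-likelihood, as compute_ll accumulates
N = len(probs)                             # number of scored instances
perplexity = np.power(2, (-1.0 / N) * ll)
print(perplexity)                          # ~6.32: the inverse geometric mean of the probabilities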
Code example #2
def process(context=5):
    """
    Prepare the train/dev/test splits for the model
    """
    # Load images
    print('Loading images...')
    (trainIM, devIM, testIM) = load_features_npy()

    # Load sentences
    print('Loading sentences...')
    d = load_sentences()

    # Load image ids
    print('Loading image ids...')
    (dx_train, dx_dev) = image_ids()

    # Load splits
    print('Loading splits...')
    (train_sp, dev_sp, test_sp) = load_splits()

    # Load captions
    print('Loading captions...')
    train = construct_captions(d, train_sp)
    dev = construct_captions(d, dev_sp)
    test = construct_captions(d, test_sp)

    # Tokenize
    (train_tokens, topwords) = tokenize(train, context=context)
    dev_tokens = tokenize(dev, context=context, topwords=topwords)[0]
    test_tokens = tokenize(test, context=context, topwords=topwords)[0]

    # Index words and create vocabulary
    print('Creating vocabulary...')
    (word_dict, index_dict) = index_words(train_tokens + dev_tokens)

    # Compute n-grams
    print('Computing n-grams...')
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)
    dev_ngrams = lm_tools.get_ngrams(dev_tokens, context=context)
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print('Computing labels...')
    train_labels = compute_labels(train_ngrams, word_dict, context=context)
    dev_labels = compute_labels(dev_ngrams, word_dict, context=context)

    # Compute model instances
    print('Computing model instances...')
    (train_instances, train_index) = lm_tools.model_inputs(train_ngrams,
                                                           word_dict,
                                                           context=context,
                                                           include_last=False,
                                                           include_index=True)
    (dev_instances, dev_index) = lm_tools.model_inputs(dev_ngrams,
                                                       word_dict,
                                                       context=context,
                                                       include_last=False,
                                                       include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(test_ngrams,
                                                         word_dict,
                                                         context=context,
                                                         include_last=False,
                                                         include_index=True)

    # Save everything into dictionaries
    print('Packing up...')
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = train_labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zd = {}
    zd['text'] = dev
    zd['tokens'] = dev_tokens
    zd['ngrams'] = dev_ngrams
    zd['labels'] = dev_labels
    zd['instances'] = dev_instances
    zd['IM'] = devIM
    zd['index'] = dev_index
    zd['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zd, zt)
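
A hypothetical sketch of how the three split dictionaries returned above might be persisted for a later training run (the pickle filenames are illustrative, not from the project):

import pickle

(z, zd, zt) = process(context=5)

# Save each processed split; only the train dictionary carries the vocabulary
for name, split in [('train.pkl', z), ('dev.pkl', zd), ('test.pkl', zt)]:
    with open(name, 'wb') as f:
        pickle.dump(split, f)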
Code example #3
File: proc.py Project: SunnyWay/im2txtDemo
def process():
    """
    Specify the dataset paths and context size below:
    """
    ##################################
    train_captions = os.getcwd() + '/mnlm/engine/iaprtc12/train_captions.txt'
    train_images = os.getcwd() + '/mnlm/engine/iaprtc12/train_hidden7.txt'
    test_captions = os.getcwd() + '/mnlm/engine/iaprtc12/test_captions.txt'
    test_images = os.getcwd() + '/mnlm/engine/iaprtc12/test_hidden7.txt'
    context = 5
    ##################################

    # Load captions
    print('Loading captions...')
    train = load_captions(train_captions)
    test = load_captions(test_captions)

    # Tokenize the data
    print('Tokenizing...')
    train_tokens = tokenize(train, context=context)
    test_tokens = tokenize(test, context=context)

    # Index words and create vocabulary
    print('Creating vocabulary...')
    (word_dict, index_dict) = index_words(train_tokens)  # build maps between words and indices
    
    # Compute n-grams
    print('Computing n-grams...')
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)  # all tuples of length context+1, grouped by caption
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print('Computing labels...')
    labels = compute_labels(train_ngrams, word_dict, context=context)

    # Compute model instances
    print('Computing model instances...')
    (train_instances, train_index) = lm_tools.model_inputs(train_ngrams, word_dict,
        context=context, include_last=False, include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(test_ngrams, word_dict,
        context=context, include_last=False, include_index=True)

    # Load image features
    print('Loading image features...')
    trainIM = load_convfeatures(train_images)
    testIM = load_convfeatures(test_images)

    # Save everything into dictionaries
    print('Packing up...')
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zt)
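
index_words is defined elsewhere in this project; the inline comment in example #3 says it builds the map between words and indices. A minimal sketch of what such a helper might look like, assuming tokenize yields one token list per caption (an illustration, not the project's implementation):

def index_words(tokens):
    # Illustrative sketch: assign each unseen word the next free index
    # and keep the inverse map alongside it.
    word_dict = {}
    index_dict = {}
    for caption in tokens:
        for word in caption:
            if word not in word_dict:
                index = len(word_dict)
                word_dict[word] = index
                index_dict[index] = word
    return (word_dict, index_dict)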
Code example #4
def process():
    """
    Specify the dataset paths and context size below:
    """
    ##################################
    train_captions = 'iaprtc12/train_captions.txt'
    train_images = 'iaprtc12/train_hidden7.txt'
    test_captions = 'iaprtc12/test_captions.txt'
    test_images = 'iaprtc12/test_hidden7.txt'
    context = 5
    ##################################

    # Load captions
    print('Loading captions...')
    train = load_captions(train_captions)
    test = load_captions(test_captions)

    # Tokenize the data
    print('Tokenizing...')
    train_tokens = tokenize(train, context=context)
    test_tokens = tokenize(test, context=context)

    # Index words and create vocabulary
    print('Creating vocabulary...')
    (word_dict, index_dict) = index_words(train_tokens)

    # Compute n-grams
    print('Computing n-grams...')
    train_ngrams = lm_tools.get_ngrams(train_tokens, context=context)
    test_ngrams = lm_tools.get_ngrams(test_tokens, context=context)

    # Compute sparse label matrix
    print('Computing labels...')
    labels = compute_labels(train_ngrams, word_dict, context=context)

    # Compute model instances
    print('Computing model instances...')
    (train_instances, train_index) = lm_tools.model_inputs(train_ngrams,
                                                           word_dict,
                                                           context=context,
                                                           include_last=False,
                                                           include_index=True)
    (test_instances, test_index) = lm_tools.model_inputs(test_ngrams,
                                                         word_dict,
                                                         context=context,
                                                         include_last=False,
                                                         include_index=True)

    # Load image features
    print('Loading image features...')
    trainIM = load_convfeatures(train_images)
    testIM = load_convfeatures(test_images)

    # Save everything into dictionaries
    print('Packing up...')
    z = {}
    z['text'] = train
    z['tokens'] = train_tokens
    z['word_dict'] = word_dict
    z['index_dict'] = index_dict
    z['ngrams'] = train_ngrams
    z['labels'] = labels
    z['instances'] = train_instances
    z['IM'] = trainIM
    z['index'] = train_index
    z['context'] = context

    zt = {}
    zt['text'] = test
    zt['tokens'] = test_tokens
    zt['ngrams'] = test_ngrams
    zt['instances'] = test_instances
    zt['IM'] = testIM
    zt['index'] = test_index
    zt['context'] = context

    return (z, zt)
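
lm_tools.get_ngrams is likewise external; the inline comment in example #3 describes it as computing all tuples of length context+1, grouped by caption. A sliding-window sketch under that assumption (illustrative, not the library's code):

def get_ngrams(tokens, context=5):
    # Illustrative sketch: one list of (context+1)-grams per tokenized caption
    ngrams = []
    for caption in tokens:
        ngrams.append([tuple(caption[i:i + context + 1])
                       for i in range(len(caption) - context)])
    return ngrams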