Example #1
0
 def iter_valid_batches(self):
     """Yield validation batches; items are length-sorted within chunks of
     20 batches so that each batch contains similarly-sized examples."""
     chunk_size = self.batch_size * 20
     for chunk in grouper(self.data['valid'], chunk_size):
         lengths = [len(seq) for seq, _, _ in chunk]
         ordered = [chunk[j] for j in numpy.argsort(lengths)]
         for group in grouper(ordered, self.batch_size):
             yield self.batcher.batch(group)
Example #2
0
def cmd_predict_v(dataset='coco',
                  datapath='.',
                  model_path='.',
                  model_name='model.pkl.gz',
                  batch_size=128,
                  output_v='predict_v.npy',
                  output_r='predict_r.npy'):
    """Predict visual vectors and hidden representations for the validation
    sentences of `dataset` and save both matrices as .npy files under
    model_path.

    NOTE(review): Python 2 code (print statement); `load`, `predictor_v`,
    `predictor_r`, `dp`, `tokens` and `grouper` are defined elsewhere in
    this project.
    """
    M = load(model_path, model_name=model_name)
    model = M['model']
    batcher = M['batcher']
    mapper = M['batcher'].mapper
    # Build the two prediction functions from the trained model:
    # predict_v maps a batch to visual feature vectors, predict_r to
    # hidden-representation vectors.
    predict_v = predictor_v(model)
    predict_r = predictor_r(model)
    prov = dp.getDataProvider(dataset, root=datapath)
    sents = list(prov.iterSentences(split='val'))
    # Tokenize each sentence and map tokens to integer ids.
    inputs = list(
        mapper.transform(
            [tokens(sent, tokenizer=batcher.tokenizer) for sent in sents]))
    print len(model.network.params())
    # Predict batch-by-batch and stack into one matrix (one row per sentence).
    preds_v = numpy.vstack([
        predict_v(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_v), preds_v)
    preds_r = numpy.vstack([
        predict_r(batcher.batch_inp(batch))
        for batch in grouper(inputs, batch_size)
    ])
    numpy.save(os.path.join(model_path, output_r), preds_r)
Example #3
0
def cmd_predict_r(model_path='.',
                  batch_size=128,
                  split='train',
                  output_premise='predict_premise_r.npy',
                  output_hypo='predict_hypo_r.npy',
                  output_labels='entailment_labels.npy'):
    """Encode SNLI premises and hypotheses with the pickled model and save
    the representations plus the entailment labels as .npy files (prefixed
    with the split name) under model_path."""
    def load(f):
        return pickle.load(gzip.open(os.path.join(model_path, f)))

    model_name = 'model.pkl.gz'
    # Note: scaler is loaded for its side effect / availability; it is not
    # used below.
    batcher, scaler, model = map(
        load, ['batcher.pkl.gz', 'scaler.pkl.gz', model_name])
    mapper = batcher.mapper
    predict_r = predictor_r(model)
    sents_premise, sents_hypo, labels = zip(*parse_snli(split=split))
    inputs_premise = list(mapper.transform(sents_premise))
    inputs_hypo = list(mapper.transform(sents_hypo))

    def represent(coded):
        # Predict batch-by-batch, then stack to one row per sentence.
        rows = [predict_r(batcher.batch_inp(chunk))
                for chunk in grouper(coded, batch_size)]
        return numpy.vstack(rows)

    def destination(name):
        return os.path.join(model_path, split + '_' + name)

    numpy.save(destination(output_premise), represent(inputs_premise))
    numpy.save(destination(output_hypo), represent(inputs_hypo))
    numpy.save(destination(output_labels), labels)
Example #4
0
 def iter_train_batches(self):
     """Yield training batches.

     With curriculum learning enabled the whole training set is sorted by
     token length first; in either case items are length-sorted within
     chunks of 20 batches so batches hold similarly-sized examples.
     """
     train = self.data['train']
     if self.curriculum:
         order = numpy.argsort([len(d['tokens_in']) for d in train])
         data = [train[j] for j in order]
     else:
         data = train
     for chunk in util.grouper(data, self.batch_size * 20):
         by_len = numpy.argsort([len(d['tokens_in']) for d in chunk])
         ordered = [chunk[j] for j in by_len]
         for group in util.grouper(ordered, self.batch_size):
             yield self.batcher.batch(group)
Example #5
0
def encode_images(model, imgs, batch_size=128, task=None):
    """Project imgs to the joint space using model.

    If task is None, model.task is used.
    """
    encoder = model.task if task is None else task
    per_batch = [encoder.encode_images(chunk)
                 for chunk in util.grouper(imgs, batch_size)]
    return numpy.vstack(per_batch)
Example #6
0
def encode_sentences(model, audios, batch_size=128):
    """Project audios to the joint space using model.

    For each audio returns a vector.
    """
    encoded = [model.task.predict(vector_padder(chunk))
               for chunk in util.grouper(audios, batch_size)]
    return numpy.vstack(encoded)
Example #7
0
def predict_img(model, audios, batch_size=32):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    predictions = []
    for chunk in util.grouper(audios, batch_size):
        predictions.append(model.task.predict(vector_padder(chunk)))
    return numpy.vstack(predictions)
Example #8
0
def predict_img(model, sents, batch_size=128):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    coded = list(model.batcher.mapper.transform(sents))
    per_batch = [model.visual.predict(model.batcher.batch_inp(chunk))
                 for chunk in util.grouper(coded, batch_size)]
    return numpy.vstack(per_batch)
Example #9
0
def encode_sentences(model, sents, batch_size=128):
    """Project sents to the joint space using model.

    For each sentence returns a vector.
    """
    coded = list(model.batcher.mapper.transform(sents))
    vectors = []
    for chunk in util.grouper(coded, batch_size):
        vectors.append(model.task.predict(model.batcher.batch_inp(chunk)))
    return numpy.vstack(vectors)
Example #10
0
def representation(model, sents, batch_size=128):
    """Project sents to hidden state space using model.

    For each sentence returns a vector corresponding to the activation of
    the hidden layer at the end-of-sentence symbol.
    """
    encoder = model.Visual
    coded = list(model.batcher.mapper.transform(sents))
    rows = []
    for chunk in util.grouper(coded, batch_size):
        # Keep only the hidden state at the final (end-of-sentence) position.
        states = encoder.representation(model.batcher.batch_inp(chunk))
        rows.append(states[:, -1, :])
    return numpy.vstack(rows)
Example #11
0
def predict_img(model, sents, batch_size=128):
    """Project sents to the visual space using model.

    For each sentence returns the predicted vector of visual features.
    """
    visual = model.Visual
    coded = list(model.batcher.mapper.transform(sents))
    predictions = [visual.predict(model.batcher.batch_inp(chunk))
                   for chunk in util.grouper(coded, batch_size)]
    return numpy.vstack(predictions)
Example #12
0
def representation(model, sents, batch_size=128):
    """Project sents to hidden state space using model.

    Returns, for each sentence, the hidden-layer activation at the
    end-of-sentence symbol.
    """
    task = model.Visual
    inputs = list(model.batcher.mapper.transform(sents))
    final_states = [task.representation(model.batcher.batch_inp(b))[:, -1, :]
                    for b in util.grouper(inputs, batch_size)]
    return numpy.vstack(final_states)
Example #13
0
def cmd_train_rte(data_path='.',
                  size=200,
                  dropout=0.0,
                  lr=0.0002,
                  epochs=1,
                  batch_size=64,
                  model_path='.',
                  seed=None):
    """Train an RTE (entailment) classifier on precomputed premise/hypothesis
    representations and pickle the trained model to model_path.

    Expects train_*/dev_* .npy files (produced by cmd_predict_r) under
    data_path. NOTE(review): Python 2 code (print statements,
    itertools.izip).
    """
    # Deep pickling of the model below can exceed the default limit.
    sys.setrecursionlimit(50000)
    if seed is not None:
        random.seed(seed)
    # Three entailment classes: entailment / neutral / contradiction
    # (presumably -- confirm against the label encoding used upstream).
    classify_size = 3
    premise_r = numpy.load(
        os.path.join(data_path, "train_predict_premise_r.npy"))
    hypo_r = numpy.load(os.path.join(data_path, "train_predict_hypo_r.npy"))
    labels = onehot(
        numpy.load(os.path.join(data_path, "train_entailment_labels.npy")),
        classify_size)
    val_premise_r = numpy.load(
        os.path.join(data_path, "dev_predict_premise_r.npy"))
    val_hypo_r = numpy.load(os.path.join(data_path, "dev_predict_hypo_r.npy"))
    val_labels = onehot(
        numpy.load(os.path.join(data_path, "dev_entailment_labels.npy")),
        classify_size)
    size_repr = premise_r.shape[1]
    model = RTE(size_repr=size_repr, size_hidden=size, dropout=dropout, lr=lr)
    start_epoch = 1
    for epoch in range(start_epoch, epochs + 1):
        # Accumulate summed cost and batch count for average loss reporting.
        costs = Counter()
        for _j, item in enumerate(
                grouper(itertools.izip(premise_r, hypo_r, labels),
                        batch_size)):
            j = _j + 1
            premise, hypo, label = zip(*item)
            cost = model.train(premise, hypo, label)
            costs += Counter({'cost': cost, 'N': 1})
        costs_valid = valid_loss(model, val_premise_r, val_hypo_r, val_labels)

        # Report mean cross-entropy on train/valid and validation accuracy.
        print epoch, j, j * batch_size, "train", "ce", costs['cost'] / costs[
            'N']
        print epoch, j, j * batch_size, "valid", "ce", costs_valid[
            'cost'] / costs_valid['N']
        print epoch, j, j*batch_size, "valid", "ac", \
            metrics.accuracy_score(numpy.argmax(val_labels, axis=1),
                                   numpy.argmax(model.predict(val_premise_r, val_hypo_r), axis=1))


#        pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.{}.pkl.gz".format(epoch)),'w'),
#                    protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(model,
                gzip.open(os.path.join(model_path, "entailment_model.pkl.gz"),
                          'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
Example #14
0
def pile(model, sents, batch_size=128):
    """Project each symbol in each sentence in sents to hidden state spaces corresponding to layers using model.

    For each sentence returns a 3D tensor corresponding to the activations
    of the hidden layers at each position in the sentence.
    """
    task = model.Visual
    lens = [len(s) for s in sents]
    coded = list(model.batcher.mapper.transform(sents))
    stacked = []
    for chunk in util.grouper(coded, batch_size):
        stacked.extend(task.pile(model.batcher.batch_inp(chunk)))
    # Trim the left padding: keep the trailing l+1 positions per sentence.
    return [r[-(l + 1):, :, :] for r, l in zip(stacked, lens)]
Example #15
0
def cmd_predict_r(model_path='.',
                  batch_size=128,
                  split='train',
                  output_premise='predict_premise_r.npy',
                  output_hypo='predict_hypo_r.npy',
                  output_labels='entailment_labels.npy'):
    """Compute representation-layer encodings for SNLI premises and
    hypotheses of the given split and save them, with the labels, as .npy
    files prefixed with the split name under model_path."""
    def load(f):
        return pickle.load(gzip.open(os.path.join(model_path, f)))
    # scaler is loaded alongside batcher/model but not used below.
    batcher, scaler, model = map(
        load, ['batcher.pkl.gz', 'scaler.pkl.gz', 'model.pkl.gz'])
    mapper = batcher.mapper
    predict_r = predictor_r(model)
    sents_premise, sents_hypo, labels = zip(*parse_snli(split=split))
    inputs_premise = list(mapper.transform(sents_premise))
    inputs_hypo = list(mapper.transform(sents_hypo))

    def encode(coded):
        # One prediction per batch, stacked to one row per sentence.
        return numpy.vstack([predict_r(batcher.batch_inp(chunk))
                             for chunk in grouper(coded, batch_size)])

    def target(name):
        return os.path.join(model_path, split + '_' + name)

    numpy.save(target(output_premise), encode(inputs_premise))
    numpy.save(target(output_hypo), encode(inputs_hypo))
    numpy.save(target(output_labels), labels)
Example #16
0
def pile(model, sents, batch_size=128):
    """Project each symbol in each sentence in sents to hidden state spaces corresponding to layers using model.

    For each sentence returns a 3D tensor corresponding to the activations
    of the hidden layers at each position in the sentence.
    """
    task = model.Visual
    lengths = [len(s) for s in sents]
    coded = list(model.batcher.mapper.transform(sents))
    piled = [state
             for chunk in util.grouper(coded, batch_size)
             for state in task.pile(model.batcher.batch_inp(chunk))]
    # Drop left padding: retain the final l+1 positions for each sentence.
    return [state[-(l + 1):, :, :] for state, l in zip(piled, lengths)]
Example #17
0
def cmd_predict_v(dataset='coco',
                  datapath='.',
                  model_path='.',
                  model_name='model.pkl.gz',
                  batch_size=128,
                  output_v='predict_v.npy',
                  output_r='predict_r.npy'):
    """Predict visual vectors and hidden representations for the validation
    sentences of `dataset` and save both matrices as .npy files under
    model_path.

    NOTE(review): Python 2 code (print statement); `load`, `predictor_v`,
    `predictor_r`, `dp`, `tokens` and `grouper` are defined elsewhere in
    this project.
    """
    M = load(model_path, model_name=model_name)
    model = M['model']
    batcher = M['batcher']
    mapper = M['batcher'].mapper
    # Build prediction functions: predict_v yields visual feature vectors,
    # predict_r yields hidden-representation vectors.
    predict_v = predictor_v(model)
    predict_r = predictor_r(model)
    prov   = dp.getDataProvider(dataset, root=datapath)
    sents  = list(prov.iterSentences(split='val'))
    # Tokenize each sentence and map tokens to integer ids.
    inputs = list(mapper.transform([tokens(sent, tokenizer=batcher.tokenizer) for sent in sents ]))
    print len(model.network.params())
    # Predict batch-by-batch and stack into one matrix (one row per sentence).
    preds_v  = numpy.vstack([ predict_v(batcher.batch_inp(batch))
                            for batch in grouper(inputs, batch_size) ])
    numpy.save(os.path.join(model_path, output_v), preds_v)
    preds_r = numpy.vstack([ predict_r(batcher.batch_inp(batch))
                             for batch in grouper(inputs, batch_size) ])
    numpy.save(os.path.join(model_path, output_r), preds_r)
Example #18
0
def featurefile(dataset='flickr8k', chunksize=1000, kind='fbank', noisy=False):
    """Extract acoustic features for a dataset of base64-encoded mp3 speech
    and save them as per-chunk .npy files.

    Reads dataset{infix}.mp3.jsonl.gz (one JSON object per line with a
    base64 'speech' field), decodes each mp3, extracts `kind` features
    ('mfcc' or 'fbank'), and writes dataset{infix}.{kind}.{i}.npy per chunk.

    Raises:
        ValueError: if `kind` is not 'mfcc' or 'fbank'.
    """
    if kind == 'mfcc':
        extract = extract_mfcc
    elif kind == 'fbank':
        extract = extract_fbank
    else:
        # BUG FIX: the original did `raise "Invalid kind"`; raising a plain
        # string is a TypeError in Python >= 2.6. Raise a real exception.
        raise ValueError("Invalid kind")
    infix = ".noisy" if noisy else ""
    for i, chunk in enumerate(util.grouper(gzip.open("/home/gchrupala/repos/reimaginet/data/{}/dataset{}.mp3.jsonl.gz".format(dataset, infix)), chunksize)):
        result = []
        for line in chunk:
            sent = json.loads(line)
            # 'speech' holds the base64-encoded mp3 audio for the sentence.
            sound = decodemp3(base64.b64decode(sent['speech']))
            result.append(extract(sound))
        numpy.save("/home/gchrupala/repos/reimaginet/data/{}/dataset{}.{}.{}.npy".format(dataset, infix, kind, i), result)
Example #19
0
def cmd_train_rte(data_path='.',
                  size=200,
                  dropout=0.0,
                  lr=0.0002,
                  epochs=1,
                  batch_size=64,
                  model_path='.',
                  seed=None):
    """Train an RTE (entailment) classifier on precomputed premise/hypothesis
    representations and pickle the trained model to model_path.

    Expects train_*/dev_* .npy files (produced by cmd_predict_r) under
    data_path. NOTE(review): Python 2 code (print statements,
    itertools.izip).
    """
    # Deep pickling of the model below can exceed the default limit.
    sys.setrecursionlimit(50000)
    if seed is not None:
                random.seed(seed)
    # Three entailment classes: entailment / neutral / contradiction
    # (presumably -- confirm against the label encoding used upstream).
    classify_size = 3
    premise_r = numpy.load(os.path.join(data_path, "train_predict_premise_r.npy"))
    hypo_r    = numpy.load(os.path.join(data_path, "train_predict_hypo_r.npy"))
    labels    = onehot(numpy.load(os.path.join(data_path, "train_entailment_labels.npy")), classify_size)
    val_premise_r = numpy.load(os.path.join(data_path, "dev_predict_premise_r.npy"))
    val_hypo_r  = numpy.load(os.path.join(data_path, "dev_predict_hypo_r.npy"))
    val_labels = onehot(numpy.load(os.path.join(data_path, "dev_entailment_labels.npy")), classify_size)
    size_repr = premise_r.shape[1]
    model = RTE(size_repr=size_repr, size_hidden=size, dropout=dropout, lr=lr)
    start_epoch=1
    for epoch in range(start_epoch, epochs+1):
        # Accumulate summed cost and batch count for average loss reporting.
        costs = Counter()
        for _j,item in enumerate(grouper(itertools.izip(premise_r, hypo_r, labels), batch_size)):
            j = _j + 1
            premise, hypo, label = zip(*item)
            cost = model.train(premise, hypo, label)
            costs += Counter({'cost':cost, 'N':1})
        costs_valid = valid_loss(model, val_premise_r, val_hypo_r, val_labels)

        # Report mean cross-entropy on train/valid and validation accuracy.
        print epoch, j, j*batch_size, "train", "ce", costs['cost']/costs['N']
        print epoch, j, j*batch_size, "valid", "ce", costs_valid['cost']/costs_valid['N']
        print epoch, j, j*batch_size, "valid", "ac", \
            metrics.accuracy_score(numpy.argmax(val_labels, axis=1),
                                   numpy.argmax(model.predict(val_premise_r, val_hypo_r), axis=1))
#        pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.{}.pkl.gz".format(epoch)),'w'),
#                    protocol=pickle.HIGHEST_PROTOCOL)
    pickle.dump(model, gzip.open(os.path.join(model_path, "entailment_model.pkl.gz"),'w'),
                protocol=pickle.HIGHEST_PROTOCOL)
Example #20
0
 def iter_train_batches(self):
     """Yield training batches; items are length-sorted within chunks of
     20 batches so that each batch contains similarly-sized examples."""
     chunk_size = self.batch_size * 20
     for chunk in grouper(self.data['train'], chunk_size):
         order = numpy.argsort([len(seq) for seq, _, _ in chunk])
         ordered = [chunk[j] for j in order]
         for group in grouper(ordered, self.batch_size):
             yield self.batcher.batch(group)
Example #21
0
def encode_images(model, imgs, batch_size=128):
    """Project imgs to the joint space using model.
    """
    chunks = util.grouper(imgs, batch_size)
    return numpy.vstack([model.task.encode_images(c) for c in chunks])
Example #22
0
def layer_states(model, audios, batch_size=128):
    """Pass audios through the model and for each audio return the state of
    each timestep and each layer.

    Returns a list of arrays, one per audio; each is the trailing `l` frames
    of the stacked layer states, where `l` is the audio's length after the
    model's convolutional downsampling (filter_length / stride).
    """
    # Downsampled length per audio.
    # BUG FIX: use a list comprehension instead of map(); with Python 3,
    # numpy.array(map(...)) wraps the map iterator in a 0-d object array.
    # Identical behavior under Python 2.
    lens = (numpy.array([len(a) for a in audios])
            + model.config['filter_length']) // model.config['stride']
    states = [s for batch in util.grouper(audios, batch_size)
              for s in model.task.pile(vector_padder(batch))]
    # Trim left padding: keep only frames covering the audio's actual length.
    return [s[-l:, :, :] for (s, l) in zip(states, lens)]
Example #23
0
 def iter_valid_batches(self):
     """Yield validation batches; items are length-sorted within chunks of
     20 batches so that each batch contains similarly-sized examples."""
     for chunk in util.grouper(self.data['valid'], self.batch_size * 20):
         order = numpy.argsort([len(d['tokens_in']) for d in chunk])
         ordered = [chunk[j] for j in order]
         for group in util.grouper(ordered, self.batch_size):
             yield self.batcher.batch(group)