Ejemplo n.º 1
0
def encode_and_pad_sentences(vocab, sentences, max_sents_per_image,
                             max_sent_len):
    """Turns sentences into word ids packed in a fixed-size, zero-padded matrix.

    Every sentence is split with `tokenize` and each token is looked up in
    `vocab`; tokens missing from `vocab` are encoded as id 0. Only the first
    `max_sents_per_image` sentences are stored, and each stored sentence is
    truncated to `max_sent_len` tokens.

    Args:
      vocab: a dict mapping from word to id.
      sentences: a list of python strings.
      max_sents_per_image: number of rows of the output matrix.
      max_sent_len: number of columns of the output matrix.

    Returns:
      num_sents: an integer, the number of input sentences. This is the full
        count, i.e. it is NOT capped at max_sents_per_image.
      sent_mat: a [max_sents_per_image, max_sent_len] int32 numpy array of
        word ids, padded with zeros.
      sent_len: a [max_sents_per_image] int32 numpy array holding the
        (truncated) length of each sentence stored in the matrix.
    """
    # Encode every sentence up front, including those that will not fit.
    token_ids = [[vocab.get(word, 0) for word in tokenize(text)]
                 for text in sentences]

    sent_mat = np.zeros((max_sents_per_image, max_sent_len), np.int32)
    sent_len = np.zeros((max_sents_per_image, ), np.int32)

    for row, ids in enumerate(token_ids[:max_sents_per_image]):
        length = min(max_sent_len, len(ids))
        sent_len[row] = length
        sent_mat[row, :length] = ids[:length]

    return len(token_ids), sent_mat, sent_len
Ejemplo n.º 2
0
def _check_coverage(vocab, sentences):
    """Reports the fraction of sentences fully covered by the vocabulary.

    A sentence counts as covered only when every token produced by `tokenize`
    is a key of `vocab`. The result is written to stderr as one line of the
    form 'Vocab coverage: 0.1234'; nothing is returned.

    Args:
      vocab: a dict mapping from word to anything.
      sentences: a list of sentences. May be empty, in which case the
        coverage is reported as 1.0 (there is no uncovered sentence).
    """
    uncover = 0
    for sentence in sentences:
        # One unknown word is enough to mark the whole sentence uncovered;
        # any() stops scanning at the first miss.
        if any(word not in vocab for word in tokenize(sentence)):
            uncover += 1

    num_sentences = len(sentences)
    if num_sentences:
        coverage = 1.0 - 1.0 * uncover / num_sentences
    else:
        # Avoid dividing by zero on an empty corpus.
        coverage = 1.0

    # sys.stderr.write emits the same bytes on Python 2 and Python 3, whereas
    # the `print >> sys.stderr, ...` statement form only works on Python 2.
    sys.stderr.write('Vocab coverage: %.4lf' % coverage + '\n')
Ejemplo n.º 3
0
def _create_vocab(sentences, min_count=2):
    """Computes the vocab given the corpus.

    Counts how often each token produced by `tokenize` occurs across all
    sentences, then drops the tokens that occur fewer than `min_count` times.
    The vocabulary size before and after pruning is written to stderr.

    Args:
      sentences: a list of strings.
      min_count: words that appear fewer than min_count times are pruned.

    Returns:
      vocab: a dict mapping from word to frequency.
    """
    vocab = {}
    for sentence in sentences:
        for word in tokenize(sentence):
            vocab[word] = vocab.get(word, 0) + 1

    # sys.stderr.write emits the same bytes on Python 2 and Python 3, whereas
    # the `print >> sys.stderr, ...` statement form only works on Python 2.
    sys.stderr.write('Number of words: %i.' % (len(vocab)) + '\n')

    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating over its live key view raises RuntimeError on Python 3.
    for word in list(vocab):
        if vocab[word] < min_count:
            del vocab[word]

    sys.stderr.write(
        'Number of words after pruning: %i.' % (len(vocab)) + '\n')
    return vocab