# Example 1
def load_data_as_sentences(path, word_to_num):
    """Load a dataset and convert it to arrays of word indices.

    Args:
        path: Path to the training data file.
        word_to_num: Dictionary mapping word strings to integer indices.

    Returns:
        A tuple ``(docs_data, S_data)`` where ``docs_data`` is the raw
        dataset loaded from `path` and ``S_data`` is an array of integer
        arrays — one array per sentence, one integer per word.
    """
    docs_data = du.load_dataset(path)
    S_data = du.docs_to_indices(docs_data, word_to_num)
    return docs_data, S_data
# Example 2
# Keep only the `vocabsize` most frequent words; build index -> word and
# word -> index maps for them.
vocabsize = 2000
num_to_word = dict(enumerate(vocab.index[:vocabsize]))
word_to_num = du.invert_dict(num_to_word)
##
# Below needed for 'adj_loss': DO NOT CHANGE
# Fraction of corpus tokens whose word falls outside the retained
# vocabulary; the "UUUNKKK" pseudo-token is excluded from both the
# numerator and the denominator.
fraction_lost = float(sum([vocab['count'][word] for word in vocab.index
                           if (not word in word_to_num)
                               and (not word == "UUUNKKK")]))
fraction_lost /= sum([vocab['count'][word] for word in vocab.index
                      if (not word == "UUUNKKK")])
print "Retained %d words from %d (%.02f%% of all tokens)" % (vocabsize, len(vocab),
                                                             100*(1-fraction_lost))

# Load the training set
docs = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs, word_to_num)
X_train, Y_train = du.seqs_to_lmXY(S_train)

# Load the dev set (for tuning hyperparameters)
docs = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs, word_to_num)
X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

# Load the test set (final evaluation only)
docs = du.load_dataset('data/lm/ptb-test.txt')
S_test = du.docs_to_indices(docs, word_to_num)
X_test, Y_test = du.seqs_to_lmXY(S_test)

# Display some sample data
#print " ".join(d[0] for d in docs[7])
#print S_test[7]
# Example 3
# Read the PTB vocabulary table: whitespace-separated lines indexed by
# the word itself, with its raw count and relative frequency.
vocab = pd.read_table(
    "data/lm/vocab.ptb.txt",
    header=None,
    sep=r"\s+",  # raw string: "\s" is an invalid escape in a plain literal
    index_col=0,
    names=['count', 'freq'],
)

# Truncate the vocabulary to the most frequent words and build the
# index <-> word lookup tables.
vocabsize = 2000
num_to_word = {idx: w for idx, w in enumerate(vocab.index[:vocabsize])}
word_to_num = du.invert_dict(num_to_word)

# Read the training and dev corpora, mapping every word to its index.
docs_train = du.load_dataset('data/lm/ptb-train.txt')
S_train = du.docs_to_indices(docs_train, word_to_num)
docs_dev = du.load_dataset('data/lm/ptb-dev.txt')
S_dev = du.docs_to_indices(docs_dev, word_to_num)


def train_ngrams(dataset):
    """
        Gets an array of arrays of indexes, each one corresponds to a word.
        Returns trigram, bigram, unigram and total counts.
    """
    # Accumulators: n-gram tuple -> occurrence count, plus the total
    # number of tokens seen. Filled in by the exercise code below.
    trigram_counts = dict()
    bigram_counts = dict()
    unigram_counts = dict()
    token_count = 0
    ### YOUR CODE HERE
# Example 4
    # NOTE(review): this is the interior of a setup function whose `def`
    # line is outside this excerpt; `vocab`, `vocabsize`, and `du` are
    # presumably defined in the enclosing scope — confirm against caller.
    # Build index <-> word maps for the `vocabsize` most frequent words.
    num_to_word = dict(enumerate(vocab.index[:vocabsize]))
    word_to_num = du.invert_dict(num_to_word)

    # Fraction of corpus tokens lost by truncating the vocabulary; the
    # "UUUNKKK" pseudo-token is excluded from both sums.
    fraction_lost = float(
        sum([
            vocab['count'][word] for word in vocab.index
            if (not word in word_to_num) and (not word == "UUUNKKK")
        ]))
    fraction_lost /= sum([
        vocab['count'][word] for word in vocab.index if (not word == "UUUNKKK")
    ])
    print "Retained %d words from %d (%.02f%% of all tokens)" % (
        vocabsize, len(vocab), 100 * (1 - fraction_lost))

    # Load train / dev / test splits and convert each index sequence into
    # (X, Y) pairs for language modeling.
    docs = du.load_dataset('data/lm/ptb-train.txt')
    S_train = du.docs_to_indices(docs, word_to_num)
    X_train, Y_train = du.seqs_to_lmXY(S_train)

    docs = du.load_dataset('data/lm/ptb-dev.txt')
    S_dev = du.docs_to_indices(docs, word_to_num)
    X_dev, Y_dev = du.seqs_to_lmXY(S_dev)

    docs = du.load_dataset('data/lm/ptb-test.txt')
    S_test = du.docs_to_indices(docs, word_to_num)
    X_test, Y_test = du.seqs_to_lmXY(S_test)

    # Sample-data printouts, kept disabled.
    #print " ".join(d[0] for d in docs[7])
    #print " ".join(num_to_word[i] for i in S_test[7])

    #For random samples from N(mu, sigma^2), use:
    #    sigma * np.random.randn(...) + mu