import numpy as np
import mxnet as mx

import data_helpers


def main():
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('data/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
    x, y = data_helpers.load_data_with_word2vec(word2vec)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)

    # reshape for convolution input: (batch, channel=1, sentence_len, embed_dim)
    x_train = np.reshape(x_train, (x_train.shape[0], 1, x_train.shape[1], x_train.shape[2]))
    x_dev = np.reshape(x_dev, (x_dev.shape[0], 1, x_dev.shape[1], x_dev.shape[2]))

    num_embed = x_train.shape[-1]
    sentence_size = x_train.shape[2]
    print('sentence max words', sentence_size)
    print('embedding size', num_embed)

    # setup_cnn_model and train_cnn are defined elsewhere in this script
    batch_size = 50
    cnn_model = setup_cnn_model(mx.gpu(1), batch_size, sentence_size, num_embed, dropout=0.5)
    train_cnn(cnn_model, x_train, y_train, x_dev, y_dev, batch_size)
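# data_helpers is not included in this excerpt. As a rough sketch of the
# interface main() assumes, load_pretrained_word2vec could parse a plain-text
# vector file such as 'data/rt.vec' (one word per line, followed by its
# embedding values) into a {word: np.ndarray} dict. The file layout here is an
# assumption, not something the source confirms.
def load_pretrained_word2vec_sketch(path):
    """Hypothetical loader: each line holds a word and whitespace-separated floats."""
    vectors = {}
    with open(path) as vec_file:
        for line in vec_file:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue  # skip headers or blank lines
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return vectors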
import logging

logger = logging.getLogger(__name__)


def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    logger.info('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embed_size = x.shape[-1]
        sentence_size = x.shape[2]
        vocab_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embed_size = num_embed
        sentence_size = x.shape[1]
        vocab_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    logger.info('Train/Valid split: %d/%d', len(y_train), len(y_dev))
    logger.info('train shape: %s', x_train.shape)
    logger.info('valid shape: %s', x_dev.shape)
    logger.info('sentence max words: %d', sentence_size)
    logger.info('embedding size: %d', embed_size)
    logger.info('vocab size: %d', vocab_size)

    train = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return train, valid, sentence_size, embed_size, vocab_size
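# Note: the logger.info calls above emit nothing unless logging is configured
# by the entry point; a minimal setup (placed in the script's __main__ block)
# would be:
#
#   logging.basicConfig(level=logging.INFO,
#                       format='%(asctime)s %(levelname)s %(message)s')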
def data_iter(batch_size, num_embed, pre_trained_word2vec=False):
    """Construct the train/valid data iterators.

    Parameters
    ----------
    batch_size : int
        Number of examples per batch.
    num_embed : int
        Embedding size to use when no pre-trained vectors are given.
    pre_trained_word2vec : bool
        Whether to load pre-trained word2vec embeddings.

    Returns
    -------
    train_set : DataIter
        Training DataIter.
    valid : DataIter
        Validation DataIter.
    sentences_size : int
        Maximum number of words per sentence.
    embedded_size : int
        Word-embedding dimensionality.
    vocabulary_size : int
        Vocabulary size (-1 when pre-trained vectors are used).
    """
    print('Loading data...')
    if pre_trained_word2vec:
        word2vec = data_helpers.load_pretrained_word2vec('data/rt.vec')
        x, y = data_helpers.load_data_with_word2vec(word2vec)
        # reshape for convolution input
        x = np.reshape(x, (x.shape[0], 1, x.shape[1], x.shape[2]))
        embedded_size = x.shape[-1]
        sentences_size = x.shape[2]
        vocabulary_size = -1
    else:
        x, y, vocab, vocab_inv = data_helpers.load_data()
        embedded_size = num_embed
        sentences_size = x.shape[1]
        vocabulary_size = len(vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/valid set
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    print('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('valid shape:', x_dev.shape)
    print('sentence max words', sentences_size)
    print('embedding size', embedded_size)
    print('vocab size', vocabulary_size)

    train_set = mx.io.NDArrayIter(x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(x_dev, y_dev, batch_size)
    return train_set, valid, sentences_size, embedded_size, vocabulary_size
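# A minimal sketch of how the iterators returned by data_iter could drive
# training through MXNet's Module API. The symbol `sym` stands in for the CNN
# definition, which is outside this excerpt; its loss layer is assumed to be
# named 'softmax' so that NDArrayIter's default label name 'softmax_label'
# matches. Optimizer and learning rate here are illustrative choices, not
# values taken from the source.
def train_with_module_sketch(sym, batch_size=50, num_epoch=10):
    train_set, valid, _, _, _ = data_iter(batch_size, num_embed=300,
                                          pre_trained_word2vec=True)
    mod = mx.mod.Module(symbol=sym, context=mx.cpu())
    mod.fit(train_set,
            eval_data=valid,
            eval_metric='acc',
            optimizer='rmsprop',
            optimizer_params={'learning_rate': 0.0005},
            num_epoch=num_epoch,
            batch_end_callback=mx.callback.Speedometer(batch_size, 50))
    return mod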
        # tail of the dev-set evaluation loop
        m.cnn_exec.forward(is_train=False)
        num_correct += sum(batchY == np.argmax(m.cnn_exec.outputs[0].asnumpy(), axis=1))
        num_total += len(batchY)

    dev_acc = num_correct * 100 / float(num_total)
    print('Iter [%d] Train: Time: %.3fs, Training Accuracy: %.3f '
          '--- Dev Accuracy thus far: %.3f'
          % (iteration, train_time, train_acc, dev_acc), file=logs)


if __name__ == '__main__':
    mix_model = mix_cnn()
    print('Loading data...')
    # word2vec = data_helpers.load_google_word2vec('/Users/guo/TrainData/google300/GoogleNews-vectors-negative300.bin')
    word2vec = data_helpers.load_pretrained_word2vec('VecForMR_.txt')
    sentences, labels = data_helpers.load_data_and_labels()
    sentences_padded = data_helpers.pad_sentences(sentences)
    x, y = data_helpers.build_input_data_with_word2vec(sentences_padded, labels, word2vec)

    mix_model.dic = data_helpers.buildGram(sentences, min1=6, min2=7)
    mix_model.initTheta()
    x_sent, mix_model.idf = data_helpers.buildDocsTFIDF(mix_model.dic, sentences)
    x_sent = np.array(x_sent)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))

    # hold out fold `cv` of ten as the test split
    cv = 1
    cv_length = len(y) // 10  # integer division so the slice bounds stay ints
    sample_test = shuffle_indices[cv_length * cv:cv_length * (cv + 1)]
    sample_train = np.concatenate((shuffle_indices[:cv_length * cv],
                                   shuffle_indices[cv_length * (cv + 1):]))
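# For reference, the same index arithmetic extended to all ten folds; a sketch
# only (the script above fixes cv = 1 rather than looping).
def ten_fold_indices(num_examples):
    """Yield (train_indices, test_indices) for each of ten folds."""
    indices = np.random.permutation(np.arange(num_examples))
    fold = num_examples // 10
    for cv in range(10):
        test = indices[fold * cv:fold * (cv + 1)]
        train = np.concatenate((indices[:fold * cv],
                                indices[fold * (cv + 1):]))
        yield train, test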