import timeit

import numpy as np

# load_corpus, doc2vec, vecnorm, the add_*_noise helpers, AutoEncoder and dump_json
# are project-local utilities; they are assumed to be importable from the surrounding
# package.


def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory
    doc_keys = list(docs.keys())  # snapshot keys so entries can be deleted while iterating
    X_docs = []
    for k in doc_keys:
        # bag-of-words vector per document, log-normalized
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]  # free each doc once vectorized
    X_docs = np.r_[X_docs]  # stack into an (n_docs, n_vocab) matrix

    # Optionally corrupt the inputs so the model is trained as a denoising autoencoder.
    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    elif args.noise:
        raise ValueError('unknown noise type: %s' % args.noise)

    n_samples = X_docs.shape[0]
    np.random.seed(0)  # fixed seed so the train/validation split is reproducible
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = sorted(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print 'added %s noise' % args.noise
    else:
        # no corruption requested: reconstruct the inputs from themselves
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()

    ae = AutoEncoder(n_vocab,
                     args.n_dim,
                     comp_topk=args.comp_topk,
                     ctype=args.ctype,
                     save_model=args.save_model)
    # fit on (corrupted input, clean target) pairs; validation data follows the same layout
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val],
           nb_epoch=args.n_epoch,
           batch_size=args.batch_size,
           contractive=args.contractive)

    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(
            dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
            args.output + '.train')
        dump_json(
            dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
            args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train',
                                                     args.output + '.val')

def unitmatrix(matrix, norm='l2', axis=1):
    """Normalize the rows (axis=1) or columns (axis=0) of a matrix to unit length."""
    if norm == 'l1':
        matrix_len = np.sum(np.abs(matrix), axis=axis)
    elif norm == 'l2':
        matrix_len = np.linalg.norm(matrix, axis=axis)
    else:
        raise ValueError('unknown norm: %s' % norm)

    if np.any(matrix_len <= 0):
        # avoid dividing by zero for all-zero rows/columns
        return matrix
    matrix_len = matrix_len.reshape(1, len(matrix_len)) if axis == 0 \
        else matrix_len.reshape(len(matrix_len), 1)
    return matrix / matrix_len


def calc_pairwise_dev(weights):
    """Root-mean-square cosine similarity between all pairs of unit-length columns,
    i.e. the average squared deviation from orthogonality (90 degrees)."""
    weights = unitmatrix(weights, axis=0)  # normalize each column to unit length
    n = weights.shape[1]
    score = 0.
    for i in range(n):
        for j in range(i + 1, n):
            score += (weights[:, i].dot(weights[:, j])) ** 2

    return np.sqrt(2. * score / n / (n - 1))
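
# A minimal usage sketch (an assumption, not part of the original script):
# calc_pairwise_dev reports the RMS cosine similarity between the columns of a
# weight matrix, e.g. an (n_vocab, n_dim) encoder weight matrix; 0 would mean
# the learned directions are perfectly orthogonal.
def _pairwise_dev_demo():
    rng = np.random.RandomState(0)
    W = rng.randn(100, 8)  # stand-in for an encoder weight matrix
    print 'pairwise deviation: %.4f' % calc_pairwise_dev(W)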

    from keras.models import load_model
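
# Minimal CLI sketch (an assumption): the flag names below simply mirror the
# attributes that train() reads from `args`; the original script may define its
# command-line interface differently.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='path to the corpus file')
    parser.add_argument('-nd', '--n_dim', type=int, default=128, help='size of the code layer')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100)
    parser.add_argument('-bs', '--batch_size', type=int, default=100)
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='number of validation docs')
    parser.add_argument('--noise', choices=['gs', 'sp', 'mn'], default=None,
                        help='gs: Gaussian, sp: salt-and-pepper, mn: masking noise')
    parser.add_argument('--comp_topk', type=int, default=None)
    parser.add_argument('--ctype', default=None)
    parser.add_argument('--contractive', type=float, default=None)
    parser.add_argument('-sm', '--save_model', default=None)
    parser.add_argument('-o', '--output', default=None)
    train(parser.parse_args())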