import timeit

import numpy as np
from keras.models import load_model

# Project-local helpers assumed importable here: load_corpus, dump_json,
# vecnorm, doc2vec, add_gaussian_noise, add_salt_pepper_noise,
# add_masking_noise, and the AutoEncoder class.


def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory
    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]  # free each doc as soon as it is vectorized
    X_docs = np.r_[X_docs]

    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    elif args.noise:
        raise ValueError('unknown noise type: %s' % args.noise)

    n_samples = X_docs.shape[0]
    np.random.seed(0)  # fixed seed so the train/val split is reproducible
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print('added %s noise' % args.noise)
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()
    ae = AutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk,
                     ctype=args.ctype, save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val],
           nb_epoch=args.n_epoch, batch_size=args.batch_size,
           contractive=args.contractive)
    print('runtime: %ss' % (timeit.default_timer() - start))

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
                  args.output + '.train')
        dump_json(dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
                  args.output + '.val')
        print('Saved doc codes to %s and %s' %
              (args.output + '.train', args.output + '.val'))
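# Sketch of a command-line entry point for train() above. The option names
# mirror the args.* attributes the function reads; the short flags, help
# strings, and defaults are assumptions for illustration, not values from
# the original project.
def _build_arg_parser():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='path to the corpus file')
    parser.add_argument('-nd', '--n_dim', type=int, required=True, help='dimensionality of the doc codes')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100, help='number of training epochs (assumed default)')
    parser.add_argument('-bs', '--batch_size', type=int, default=100, help='mini-batch size (assumed default)')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='size of the validation split (assumed default)')
    parser.add_argument('--noise', choices=['gs', 'sp', 'mn'], help='gaussian / salt-and-pepper / masking noise')
    parser.add_argument('--comp_topk', type=int, help='k for the competitive layer')
    parser.add_argument('--ctype', help='competition type passed to AutoEncoder')
    parser.add_argument('--contractive', type=float, help='contractive penalty coefficient')
    parser.add_argument('--save_model', help='path to save the trained model')
    parser.add_argument('-o', '--output', help='path prefix for the dumped doc codes')
    return parser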
def unitmatrix(matrix, norm='l2', axis=1):
    """Scale rows (axis=1) or columns (axis=0) of matrix to unit norm."""
    if norm == 'l1':
        matrixlen = np.sum(np.abs(matrix), axis=axis)
    elif norm == 'l2':
        matrixlen = np.linalg.norm(matrix, axis=axis)
    else:
        raise ValueError('unknown norm: %s' % norm)
    if np.any(matrixlen <= 0):
        return matrix
    matrixlen = matrixlen.reshape(1, len(matrixlen)) if axis == 0 \
        else matrixlen.reshape(len(matrixlen), 1)
    return matrix / matrixlen


def calc_pairwise_dev(weights):
    # average squared deviation from 0 (i.e., from a 90-degree angle) over
    # all pairs of weight columns; 0 means the columns are orthogonal
    weights = unitmatrix(weights, axis=0)  # normalize columns
    n = weights.shape[1]
    score = 0.
    for i in range(n):
        for j in range(i + 1, n):
            score += (weights[:, i].dot(weights[:, j]))**2
    return np.sqrt(2. * score / n / (n - 1))
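if __name__ == '__main__':
    # quick sanity check for calc_pairwise_dev with illustrative inputs (not
    # project data): orthogonal columns score 0, identical columns score 1
    assert np.isclose(calc_pairwise_dev(np.eye(4)), 0.0)
    assert np.isclose(calc_pairwise_dev(np.ones((4, 2))), 1.0)
    # CLI wiring via the _build_arg_parser() sketch above, e.g.
    #   python train.py -i corpus.json -nd 128 --noise sp -o doc_codes
    train(_build_arg_parser().parse_args())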