def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory

    # Convert each document to a normalized bag-of-words vector, freeing
    # entries as we go. Materialize the keys so we can delete while iterating.
    X_docs = []
    for k in list(docs.keys()):
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]

    np.random.seed(0)
    np.random.shuffle(X_docs)
    # X_docs_noisy = corrupted_matrix(np.r_[X_docs], 0.1)

    # Hold out the last n_val documents for validation.
    n_val = args.n_val
    # X_train = np.r_[X_docs[:-n_val]]
    # X_val = np.r_[X_docs[-n_val:]]
    X_train = np.r_[X_docs[:-n_val]]
    del X_docs[:-n_val]
    X_val = np.r_[X_docs]
    del X_docs

    start = timeit.default_timer()
    vae = VarAutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk,
                         ctype=args.ctype, save_model=args.save_model)
    vae.fit([X_train, X_train], [X_val, X_val],
            nb_epoch=args.n_epoch, batch_size=args.batch_size)
    print('runtime: %ss' % (timeit.default_timer() - start))
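# A minimal sketch of the CLI entry point this train() expects. The flag
# names mirror the attributes read from `args` above; the short options and
# defaults are assumptions, not values taken from the repo.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', required=True, help='path to the corpus file')
    parser.add_argument('-nd', '--n_dim', type=int, required=True, help='num of dimensions')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='size of validation split')
    parser.add_argument('-ne', '--n_epoch', type=int, default=100, help='num of epochs')
    parser.add_argument('-bs', '--batch_size', type=int, default=100, help='batch size')
    parser.add_argument('-ck', '--comp_topk', type=int, help='competitive topk')
    parser.add_argument('-ct', '--ctype', help='competitive type, e.g. kcomp')
    parser.add_argument('-sm', '--save_model', default='model', help='path to save the model')
    train(parser.parse_args())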
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())  # materialize so we can delete while iterating
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=50)
        print_topics(topics_strength)
        # save_topics_strength(topics_strength, args.save_topics)
        save_chinese_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price',
                   'stock', 'share', 'award', 'risk', 'security', 'bank',
                   'company', 'service', 'grant', 'agreement', 'proxy', 'loan',
                   'capital', 'asset', 'bonus', 'shareholder', 'income',
                   'financial', 'net', 'purchase', 'position', 'management',
                   'loss', 'salary', 'stockholder', 'due', 'business',
                   'transaction', 'govern', 'trading', 'tax', 'march', 'april',
                   'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey',
                   'comput', 'space']
        words = []
        for each in queries:
            if each in vocab:
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print('Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi))
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
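# revdict() is used above to invert the vocab before looking words up by
# index. A plausible sketch of that helper, assuming vocab maps word -> index
# (not necessarily the repo's exact implementation):
def revdict(d):
    # Invert a {word: index} mapping into {index: word}.
    return {v: k for k, v in d.items()}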
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())  # materialize so we can delete while iterating
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    model = AutoEncoder
    # model = DeepAutoEncoder
    ae = load_model(model, args.load_arch, args.load_weights)

    doc_codes = ae.encoder.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey',
                   'comput', 'space']
        words = []
        for each in queries:
            if each in vocab:  # guard against out-of-vocabulary queries
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print('Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi))
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
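# doc2vec() and vecnorm(..., 'logmax1', 0) form the featurization step used
# by every train()/test() above. This is a hypothetical sketch of that
# pipeline; the 'logmax1' formula below is one plausible reading (log-scaled
# term frequency divided by its maximum), not the repo's confirmed code.
import numpy as np

def doc2vec(doc, n_vocab):
    # Expand a sparse {word_index: count} dict into a dense count vector.
    vec = np.zeros(n_vocab)
    for idx, count in doc.items():
        vec[int(idx)] = count
    return vec

def vecnorm(vec, norm, epsilon=0):
    # Assumed 'logmax1' scheme: log(1 + tf) scaled by its own maximum.
    if norm == 'logmax1':
        scaled = np.log1p(vec)
        m = scaled.max()
        return scaled / m if m > 0 else scaled
    return vec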
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())  # materialize so we can delete while iterating
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    vae = load_vae_model(args.load_model)
    doc_codes = vae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)
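# dump_json() is assumed to be a thin serialization helper; a minimal sketch
# consistent with how it is called above (the object is already a plain
# dict of lists, so no custom encoder is needed):
import json

def dump_json(obj, path):
    # Write a JSON-serializable object, e.g. {doc_key: code_vector}, to disk.
    with open(path, 'w') as f:
        json.dump(obj, f)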
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory

    doc_keys = list(docs.keys())  # materialize so we can delete while iterating
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    # Optionally corrupt the input for denoising training.
    X_docs_noisy = None
    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    elif args.noise:
        raise ValueError('unknown noise type: %s' % args.noise)

    # Random train/validation split.
    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print('added %s noise' % args.noise)
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()
    ae = AutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk,
                     ctype=args.ctype, save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val],
           nb_epoch=args.n_epoch, batch_size=args.batch_size,
           contractive=args.contractive)
    print('runtime: %ss' % (timeit.default_timer() - start))

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
                  args.output + '.train')
        dump_json(dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
                  args.output + '.val')
        print('Saved doc codes file to %s and %s'
              % (args.output + '.train', args.output + '.val'))
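# The three noise helpers above are the standard denoising-autoencoder
# corruptions. A hypothetical sketch of the masking variant ('mn'): zero out
# a random fraction of the entries in each input vector.
import numpy as np

def add_masking_noise(X, frac):
    X_noisy = X.copy()
    mask = np.random.rand(*X.shape) < frac  # True for entries to drop
    X_noisy[mask] = 0.
    return X_noisy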
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price',
                   'stock', 'share', 'award', 'risk', 'security', 'bank',
                   'company', 'service', 'grant', 'agreement', 'proxy', 'loan',
                   'capital', 'asset', 'bonus', 'shareholder', 'income',
                   'financial', 'net', 'purchase', 'position', 'management',
                   'loss', 'salary', 'stockholder', 'due', 'business',
                   'transaction', 'govern', 'trading', 'tax', 'march', 'april',
                   'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        while True:  # interactive query loop; exit with Ctrl-C
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            query = re.sub(r'[^\w\s-]', ' ', query)  # remove punctuation except hyphens
            query_words = []
            for word in query.lower().split():  # convert to lowercase
                if word not in stopwords.words('english'):  # remove stop words
                    query_words.append(word)
            # ===== make the query length to be (32) = time_steps size
            """long_enough = False
            while not long_enough:
                for word in query_words:
                    query_vectors.append(word2vec_map[word])
                    if len(query_vectors) == 32:
                        long_enough = True
                        break"""
            words = []
            for each in query_words:
                if each in vocab:  # skip out-of-vocabulary words
                    words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print('Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi))
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
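# get_similar_words() is assumed to rank the vocabulary by closeness to a
# query word in the learned embedding. A hypothetical sketch using cosine
# similarity over rows of the first encoder weight matrix (via the
# unitmatrix helper defined later in this file):
import numpy as np

def get_similar_words(ae, word_idx, revocab, topn=10):
    W = unitmatrix(ae.get_weights()[0])  # n_vocab x n_dim, unit-norm rows
    sims = W.dot(W[word_idx])            # cosine similarity to the query row
    best = np.argsort(-sims)[:topn]
    return [revocab[i] for i in best]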
def learn_embedding(self, graph=None, edge_f=None, is_weighted=False,
                    no_python=False, path_output="", dataset=""):
    n_dim = self._d
    method = "sdne"
    input_path = path_output + '/train.corpus'
    path_graph_embedding = (path_source + "embedding/" + dataset
                            + "/embedding_gem_sdne_" + dataset + "_"
                            + str(n_dim) + ".txt")
    path_graph_embedding_id = (path_source + "embedding/" + dataset
                               + "/id_gem_" + method + "_" + dataset + "_"
                               + str(n_dim) + ".txt")
    save_model = 'model'
    val_split = 0.0214
    batch_size = self._batch_size
    comp_topk = self._comp_topk
    optimizer = self._optimizer
    lr = self._lr
    alpha = self._alpha
    kfactor = self._kfactor
    gamma = self._gamma
    select_diff = self._select_diff
    select_loss = self._select_loss
    select_graph_np_diff = self._select_graph_np_diff
    contractive = None
    ctype = "kcomp"
    nb_epoch = 1000

    if not graph and not edge_f:
        raise Exception('graph/edge_f needed')
    if not graph:
        graph = graph_util.loadGraphFromEdgeListTxt(edge_f)

    num_nodes = graph.number_of_nodes()
    graph3 = nx.DiGraph()
    graph3.add_nodes_from(range(0, num_nodes))
    with open(edge_f, "r") as f:
        for x, y in csv.reader(f, delimiter=' '):
            graph3.add_edge(int(x), int(y))

    S = nx.to_scipy_sparse_matrix(graph, nodelist=sorted(graph.nodes()))
    t1 = time()
    S = (S + S.T) / 2
    node_num = graph.number_of_nodes()
    edges_num = graph.number_of_edges()
    dict_nodes = {k: v for v, k in enumerate(sorted(graph.nodes()))}

    # Load graph embeddings
    if path_graph_embedding.endswith(".txt"):
        print("Loading SDNE embeddings")
        graph_embeddings = np.loadtxt(path_graph_embedding, delimiter=',')
        with open(path_graph_embedding_id) as temp_file:
            graph_embedding_id = [line.rstrip('\n') for line in temp_file]
        dict_graph = {k: v for v, k in enumerate(graph_embedding_id)}
    else:
        raise Exception('sdne embeddings do not exist')
        # graph_embeddings = pickle.load(open(path_graph_embedding, "rb"))

    # Load text data
    print("Loading textual corpus")
    corpus = load_corpus(input_path)
    n_vocab = len(corpus['vocab'])
    docs = corpus['docs']
    corpus.clear()  # save memory
    doc_keys = np.array(list(docs))
    dict_doc = {int(k): v for v, k in enumerate(doc_keys)}
    X_docs = []
    for k in list(docs):
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]
    # dump_json(dict(zip(doc_keys.tolist(), X_docs.tolist())),
    #           path_source + 'embedding/' + dataset + '/bow.txt')

    text_vector = self.get_node_representation(graph, X_docs, dict_doc)
    graph_vector = self.get_node_representation(graph, graph_embeddings, dict_nodes)
    # return S, node_num, edges_num, graph_embeddings, X_docs, n_vocab, doc_keys, text_vector, graph_vector

    train_data = [text_vector, text_vector, graph_vector]
    result, _Y, model = fit_quadruple_hyperas(
        n_vocab, n_dim, comp_topk=comp_topk, ctype=ctype, save_model=save_model,
        kfactor=kfactor, alpha=alpha, gamma=gamma, num_nodes=node_num,
        num_edges=edges_num, train_data=train_data, test_data=X_docs,
        val_split=val_split, nb_epoch=nb_epoch, batch_size=batch_size,
        contractive=contractive, optimizer=optimizer, lr=lr,
        select_diff=select_diff, select_loss=select_loss,
        select_graph_np_diff=select_graph_np_diff)

    dump_json(dict(zip(doc_keys.tolist(), _Y.tolist())),
              path_source + 'embedding/' + dataset + '/predicted_cage_embedding.txt')
    print('Saved doc codes file')

    self._Y = _Y
    self._node_num = node_num
    self._X = X_docs
    _Y_id = doc_keys.tolist()
    return _Y, _Y_id, len(result.history["loss"]), t1
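# get_node_representation() is called above to align per-node feature rows
# with the graph's node ordering. A hypothetical sketch of that helper,
# assuming `id_map` translates node ids into row indices of `features`:
import numpy as np

def get_node_representation(graph, features, id_map):
    # One row of `features` per graph node, in sorted node order.
    return np.stack([features[id_map[n]] for n in sorted(graph.nodes())])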
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    # vocab = corpus['vocab']
    corpus.clear()  # save memory

    doc_keys = list(docs.keys())  # materialize so we can delete while iterating
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    # Optionally corrupt the input for denoising training.
    X_docs_noisy = None
    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    elif args.noise:
        raise ValueError('unknown noise type: %s' % args.noise)

    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print('added %s noise' % args.noise)
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()
    ae = AutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk,
                     ctype=args.ctype, save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val],
           nb_epoch=args.n_epoch, batch_size=args.batch_size,
           contractive=args.contractive)
    print('runtime: %ss' % (timeit.default_timer() - start))

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())),
                  args.output + '.train')
        dump_json(dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())),
                  args.output + '.val')
        print('Saved doc codes file to %s and %s'
              % (args.output + '.train', args.output + '.val'))


def unitmatrix(matrix, norm='l2', axis=1):
    # Normalize rows (axis=1) or columns (axis=0) of a matrix to unit norm;
    # return the matrix unchanged if any norm is non-positive.
    if norm == 'l1':
        matrixlen = np.sum(np.abs(matrix), axis=axis)
    if norm == 'l2':
        matrixlen = np.linalg.norm(matrix, axis=axis)
    if np.any(matrixlen <= 0):
        return matrix
    else:
        matrixlen = (matrixlen.reshape(1, len(matrixlen)) if axis == 0
                     else matrixlen.reshape(len(matrixlen), 1))
        return matrix / matrixlen


def calc_pairwise_dev(weights):
    # The average squared deviation from 0 (90 degrees) across all pairs of
    # weight columns; smaller values mean more nearly orthogonal topic directions.
    weights = unitmatrix(weights, axis=0)  # normalize
    n = weights.shape[1]
    score = 0.
    for i in range(n):
        for j in range(i + 1, n):
            score += (weights[:, i].dot(weights[:, j])) ** 2
    return np.sqrt(2. * score / n / (n - 1))


from keras.models import load_model
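# Usage sketch for the distinctness metric above: orthogonal topic
# directions score 0, and duplicated directions push the score up.
import numpy as np

W = np.eye(4)                 # four mutually orthogonal "topics"
print(calc_pairwise_dev(W))   # 0.0
W[:, 1] = W[:, 0]             # collapse two topics onto one direction
print(calc_pairwise_dev(W))   # ~0.41, deviation from orthogonality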