])

# Background word distribution: total counts excluding the index-0 vocabulary entry,
# normalized into per-word probabilities used for pseudo-document generation.
total_counts = sum(word_counts[ele] for ele in word_counts)
total_counts -= word_counts[vocabulary_inv_list[0]]
background_array = np.zeros(vocab_sz)
for i in range(1, vocab_sz):
    background_array[i] = word_counts[vocabulary_inv[i]] / total_counts

seed_docs, seed_label = pseudodocs(word_sup_array, gamma, background_array,
                                   sequence_length, len_avg, len_std, beta, alpha,
                                   vocabulary_inv, embedding_mat, centers, kappa,
                                   'cnn',
                                   './results/{}/{}/phase1/'.format(args.dataset, 'cnn'))

# When supervision comes from labeled documents, mix (oversampled) real documents
# into the generated pseudo documents.
if args.sup_source == 'docs':
    num_real_doc = len(sup_idx.flatten()) * int(1 + beta * 0.1)
    real_seed_docs, real_seed_label = augment(x, sup_idx, num_real_doc)
    seed_docs = np.concatenate((seed_docs, real_seed_docs), axis=0)
    seed_label = np.concatenate((seed_label, real_seed_label), axis=0)

# Shuffle pseudo and real seed documents together.
perm_seed = np.random.permutation(len(seed_label))
seed_docs = seed_docs[perm_seed]
seed_label = seed_label[perm_seed]

print('\n### Phase 2: pre-training with pseudo documents ###')
wstc.pretrain(x=seed_docs, pretrain_labels=seed_label, sup_idx=sup_idx,
              optimizer=SGD(lr=0.1, momentum=0.9),
              epochs=pretrain_epochs, batch_size=args.batch_size,
def proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr, decay, update_interval,
                  delta, class_tree, level, expand_num, background_array, doc_length, sent_length,
                  len_avg, len_std, num_doc, interp_weight, vocabulary_inv, common_words):
    print(f"\n### Proceeding level {level} ###")
    dataset = args.dataset
    sup_source = args.sup_source
    maxiter = args.maxiter.split(',')
    maxiter = int(maxiter[level])
    batch_size = args.batch_size
    parents = class_tree.find_at_level(level)
    parents_names = [parent.name for parent in parents]
    print(f'Nodes: {parents_names}')

    for parent in parents:
        # initialize classifiers in hierarchy
        print("\n### Input preparation ###")
        if class_tree.embedding is None:
            train_class_embedding(x, vocabulary_inv, dataset_name=args.dataset, node=class_tree)
        parent.embedding = class_tree.embedding
        wstc.instantiate(class_tree=parent)

        save_dir = f'./results/{dataset}/{sup_source}/level_{level}'

        if parent.model is not None:
            print("\n### Phase 1: vMF distribution fitting & pseudo document generation ###")

            # Choose the pseudo-document generator.
            if args.pseudo == "bow":
                print("Pseudo documents generation (Method: Bag-of-words)...")
                seed_docs, seed_label = bow_pseudodocs(parent.children, expand_num, background_array,
                                                       doc_length, len_avg, len_std, num_doc,
                                                       interp_weight, vocabulary_inv,
                                                       parent.embedding, save_dir)
            elif args.pseudo == "lstm":
                print("Pseudo documents generation (Method: LSTM language model)...")
                lm = train_lstm(sequences, common_words, sent_length, f'./{dataset}/lm',
                                embedding_matrix=class_tree.embedding)
                seed_docs, seed_label = lstm_pseudodocs(parent, expand_num, doc_length, len_avg,
                                                        sent_length, len_std, num_doc, interp_weight,
                                                        vocabulary_inv, lm, common_words, save_dir)

            print("Finished pseudo documents generation.")
            # Integer division: num_real_doc is used as a document count.
            num_real_doc = len(seed_docs) // 5

            if sup_source == 'docs':
                real_seed_docs, real_seed_label = augment(x, parent.children, num_real_doc)
                print(f'Labeled docs {len(real_seed_docs)} + Pseudo docs {len(seed_docs)}')
                seed_docs = np.concatenate((seed_docs, real_seed_docs), axis=0)
                seed_label = np.concatenate((seed_label, real_seed_label), axis=0)

            # Shuffle pseudo and real seed documents together.
            perm = np.random.permutation(len(seed_label))
            seed_docs = seed_docs[perm]
            seed_label = seed_label[perm]

            print('\n### Phase 2: pre-training with pseudo documents ###')
            print(f'Pretraining node {parent.name}')
            wstc.pretrain(x=seed_docs, pretrain_labels=seed_label, model=parent.model,
                          optimizer=SGD(lr=0.1, momentum=0.9),
                          epochs=pretrain_epochs, batch_size=batch_size,
                          save_dir=save_dir, suffix=parent.name)

    # Combine the per-node classifiers at this level into a level-wide classifier.
    global_classifier = wstc.ensemble_classifier(level)
    wstc.model.append(global_classifier)

    t0 = time()
    print("\n### Phase 3: self-training ###")
    selftrain_optimizer = SGD(lr=self_lr, momentum=0.9, decay=decay)
    wstc.compile(level, optimizer=selftrain_optimizer, loss='kld')
    y_pred = wstc.fit(x, level=level, tol=delta, maxiter=maxiter, batch_size=batch_size,
                      update_interval=update_interval, save_dir=save_dir)
    print(f'Self-training time: {time() - t0:.2f}s')

    return y_pred
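
# Usage sketch (hypothetical; not part of the original source): proceed_level() is intended to
# be driven once per level of the label hierarchy, from the top level downwards, after the data,
# args, class_tree, and wstc objects referenced above have been prepared. `total_levels` is an
# assumed stand-in for the number of hierarchy levels (e.g. len(args.maxiter.split(','))).
for level in range(total_levels):
    y_pred = proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr, decay,
                           update_interval, delta, class_tree, level, expand_num,
                           background_array, doc_length, sent_length, len_avg, len_std,
                           num_doc, interp_weight, vocabulary_inv, common_words)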