Example 1

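        # Build the background word distribution over the vocabulary.
        # Index 0 is assumed to be the padding token, so it is excluded
        # from both the total count and the distribution.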
        total_counts = sum(word_counts.values())
        total_counts -= word_counts[vocabulary_inv_list[0]]
        background_array = np.zeros(vocab_sz)
        for i in range(1, vocab_sz):
            background_array[i] = word_counts[vocabulary_inv[i]] / total_counts
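        # Generate class-specific pseudo documents; the parameters suggest the
        # vMF-fitted class distributions are interpolated with the background
        # distribution (alpha/beta/gamma and kappa control the mix).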
        seed_docs, seed_label = pseudodocs(
            word_sup_array, gamma, background_array, sequence_length, len_avg,
            len_std, beta, alpha, vocabulary_inv, embedding_mat, centers,
            kappa, 'cnn',
            './results/{}/{}/phase1/'.format(args.dataset, 'cnn'))

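        # When supervision comes from labeled documents, mix real labeled
        # documents into the pseudo corpus.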
        if args.sup_source == 'docs':
            num_real_doc = len(sup_idx.flatten()) * int(1 + beta * 0.1)
            real_seed_docs, real_seed_label = augment(x, sup_idx, num_real_doc)
            seed_docs = np.concatenate((seed_docs, real_seed_docs), axis=0)
            seed_label = np.concatenate((seed_label, real_seed_label), axis=0)

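        # Shuffle pseudo and real seed documents together before pre-training.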
        perm_seed = np.random.permutation(len(seed_label))
        seed_docs = seed_docs[perm_seed]
        seed_label = seed_label[perm_seed]

        print('\n### Phase 2: pre-training with pseudo documents ###')

        wstc.pretrain(x=seed_docs,
                      pretrain_labels=seed_label,
                      sup_idx=sup_idx,
                      optimizer=SGD(lr=0.1, momentum=0.9),
                      epochs=pretrain_epochs,
                      batch_size=args.batch_size)
Example 2
def proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr, decay,
                  update_interval, delta, class_tree, level, expand_num,
                  background_array, doc_length, sent_length, len_avg, len_std,
                  num_doc, interp_weight, vocabulary_inv, common_words):
    print(f"\n### Proceeding level {level} ###")
    dataset = args.dataset
    sup_source = args.sup_source
    maxiter = int(args.maxiter.split(',')[level])  # per-level cap on self-training iterations
    batch_size = args.batch_size
    parents = class_tree.find_at_level(level)
    parents_names = [parent.name for parent in parents]
    print(f'Nodes: {parents_names}')
    
    for parent in parents:
        # initialize classifiers in hierarchy
        print("\n### Input preparation ###")

        if class_tree.embedding is None:
            train_class_embedding(x, vocabulary_inv, dataset_name=args.dataset, node=class_tree)
        parent.embedding = class_tree.embedding
        wstc.instantiate(class_tree=parent)
        
        save_dir = f'./results/{dataset}/{sup_source}/level_{level}'

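        # parent.model may be None when instantiate() created no classifier
        # for this node (e.g., a node with no children); skip such nodes.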
        if parent.model is not None:
            
            print("\n### Phase 1: vMF distribution fitting & pseudo document generation ###")

            if args.pseudo == "bow":
                print("Pseudo documents generation (Method: Bag-of-words)...")
                seed_docs, seed_label = bow_pseudodocs(parent.children, expand_num, background_array, doc_length, len_avg,
                                                        len_std, num_doc, interp_weight, vocabulary_inv, parent.embedding, save_dir)
            elif args.pseudo == "lstm":
                print("Pseudo documents generation (Method: LSTM language model)...")
                lm = train_lstm(sequences, common_words, sent_length, f'./{dataset}/lm', embedding_matrix=class_tree.embedding)
                
                seed_docs, seed_label = lstm_pseudodocs(parent, expand_num, doc_length, len_avg, sent_length, len_std, num_doc, 
                                                        interp_weight, vocabulary_inv, lm, common_words, save_dir)
            
            print("Finished pseudo documents generation.")
            num_real_doc = len(seed_docs) // 5  # floor division: num_real_doc must be an integer count

            if sup_source == 'docs':
                real_seed_docs, real_seed_label = augment(x, parent.children, num_real_doc)
                print(f'Labeled docs {len(real_seed_docs)} + Pseudo docs {len(seed_docs)}')
                seed_docs = np.concatenate((seed_docs, real_seed_docs), axis=0)
                seed_label = np.concatenate((seed_label, real_seed_label), axis=0)

            perm = np.random.permutation(len(seed_label))
            seed_docs = seed_docs[perm]
            seed_label = seed_label[perm]

            print('\n### Phase 2: pre-training with pseudo documents ###')
            print(f'Pretraining node {parent.name}')

            wstc.pretrain(x=seed_docs, pretrain_labels=seed_label, model=parent.model,
                        optimizer=SGD(lr=0.1, momentum=0.9),
                        epochs=pretrain_epochs, batch_size=batch_size,
                        save_dir=save_dir, suffix=parent.name)

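    # Combine the per-node local classifiers into a single global classifier
    # for this level before self-training.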
    global_classifier = wstc.ensemble_classifier(level)
    wstc.model.append(global_classifier)
    t0 = time()
    print("\n### Phase 3: self-training ###")
    selftrain_optimizer = SGD(lr=self_lr, momentum=0.9, decay=decay)
    wstc.compile(level, optimizer=selftrain_optimizer, loss='kld')
    y_pred = wstc.fit(x, level=level, tol=delta, maxiter=maxiter, batch_size=batch_size,
                      update_interval=update_interval, save_dir=save_dir)
    print(f'Self-training time: {time() - t0:.2f}s')
    return y_pred
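
A minimal driver sketch for Example 2, assuming proceed_level is called once per level of the class taxonomy, top-down. The loop is illustrative only: num_levels and the pre-built inputs (x, sequences, class_tree, and the rest) are assumptions, not values defined in the code above.

# Hypothetical driver (illustrative): one proceed_level call per level,
# walking the class hierarchy from the root downward.
for level in range(num_levels):  # num_levels: assumed hierarchy depth
    y_pred = proceed_level(x, sequences, wstc, args, pretrain_epochs, self_lr,
                           decay, update_interval, delta, class_tree, level,
                           expand_num, background_array, doc_length, sent_length,
                           len_avg, len_std, num_doc, interp_weight,
                           vocabulary_inv, common_words)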