def __init__(self, word_vocab_size, word_embedding_dim, word_out_channels, output_size, 
                 dropout_p=0.5,
                 pretrained=None,
                 cuda_device=0):
        
        super(CNN, self).__init__()
        self.cuda_device = cuda_device
        self.word_vocab_size = word_vocab_size
        self.word_embedding_dim = word_embedding_dim
        self.word_out_channels = word_out_channels
        
        self.initializer = Initializer()
        self.loader = Loader()

        self.embedding = nn.Embedding(word_vocab_size, word_embedding_dim)

        if pretrained is not None:
            self.embedding.weight = nn.Parameter(torch.FloatTensor(pretrained))

        # question encoder (Q-CNN)
        self.question_encoder = EncoderCNN(word_vocab_size, word_embedding_dim, word_out_channels)
        # answer encoder (A-CNN)
        self.answer_encoder = EncoderCNN(word_vocab_size, word_embedding_dim, word_out_channels)

        # bilinear interaction matrix for the question-answer similarity feature
        self.interaction = nn.Parameter(torch.FloatTensor(word_out_channels, word_out_channels).uniform_(0, .1))
        self.dropout = nn.Dropout(p=dropout_p)
        
        hidden_size = word_out_channels * 2 + 1
        self.linear = nn.Linear(hidden_size, hidden_size//2)
        self.linear2 = nn.Linear(hidden_size//2, output_size)
        self.Tanh = nn.Tanh()
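
The hidden size of word_out_channels * 2 + 1 implies that the classifier consumes the two sentence encodings concatenated with a single bilinear similarity scalar. A minimal sketch of the matching forward pass, assuming (the original does not show this) that each EncoderCNN maps a batch of token ids to a (batch, word_out_channels) vector:

def forward(self, questions, answers):
    # Encode each side into a fixed-size vector (assumed shape: batch x C).
    q = self.question_encoder(questions)
    a = self.answer_encoder(answers)
    # Bilinear similarity q^T M a, one scalar per pair: (batch, 1).
    sim = torch.sum((q @ self.interaction) * a, dim=1, keepdim=True)
    # Concatenate to (batch, 2C + 1), matching hidden_size above.
    features = self.dropout(torch.cat([q, sim, a], dim=1))
    return self.linear2(self.Tanh(self.linear(features)))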
Example 2
    def __init__(self,
                 word_vocab_size,
                 word_embedding_dim,
                 word_hidden_dim,
                 output_size,
                 pretrained=None,
                 n_layers=1,
                 bidirectional=True,
                 dropout_p=0.5,
                 with_sim_features=True,
                 cuda_device=0):

        super(BiLSTM, self).__init__()

        self.cuda_device = cuda_device

        self.word_vocab_size = word_vocab_size
        self.word_embedding_dim = word_embedding_dim
        self.word_hidden_dim = word_hidden_dim

        self.initializer = Initializer()
        self.loader = Loader()

        self.with_sim_features = with_sim_features

        self.embedding = nn.Embedding(word_vocab_size, word_embedding_dim)

        if pretrained is not None:
            self.embedding.weight = nn.Parameter(torch.FloatTensor(pretrained))

        # question encoder (Q-LSTM)
        self.question_encoder = EncoderRNN(word_vocab_size,
                                           word_embedding_dim,
                                           word_hidden_dim,
                                           n_layers=n_layers,
                                           bidirectional=bidirectional,
                                           cuda_device=self.cuda_device)
        # answer encoder (A-LSTM)
        self.answer_encoder = EncoderRNN(word_vocab_size,
                                         word_embedding_dim,
                                         word_hidden_dim,
                                         n_layers=n_layers,
                                         bidirectional=bidirectional,
                                         cuda_device=self.cuda_device)

        hidden_size = 2 * (2 * n_layers * word_hidden_dim
                           if bidirectional else n_layers * word_hidden_dim)

        if self.with_sim_features:
            word_out_dim = 2 * n_layers * word_hidden_dim if bidirectional else n_layers * word_hidden_dim
            self.interaction = nn.Parameter(
                torch.FloatTensor(word_out_dim, word_out_dim).uniform_(0, .1))
            hidden_size += 1

        self.dropout = nn.Dropout(p=dropout_p)
        self.linear = nn.Linear(hidden_size, hidden_size // 2)
        self.linear2 = nn.Linear(hidden_size // 2, output_size)
        self.Tanh = nn.Tanh()
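
For concreteness: with the defaults n_layers=1 and bidirectional=True and an illustrative word_hidden_dim of 160, each encoder emits a 2 * 1 * 160 = 320-dimensional vector, so hidden_size starts at 2 * 320 = 640 for the question-answer pair and becomes 641 once the bilinear similarity feature adds its scalar:

# Illustrative only; mirrors the hidden_size arithmetic above.
n_layers, bidirectional, word_hidden_dim = 1, True, 160
word_out_dim = (2 if bidirectional else 1) * n_layers * word_hidden_dim  # 320
hidden_size = 2 * word_out_dim                                           # 640 (q + a)
hidden_size += 1                                                         # 641 with sim feature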
Example 3
def main(args):

    task_seq = [
        # The config for a task:
        # acquire_method (sub_acquire_method): random (""), no-dete ("DAL", "BALD"), dete ("coreset", "entropy", ...)
        # example data_path: ../../datasets/answer_selection/YahooCQA/data/data-FD/
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DAL",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 0,
            "sample_method": "No-Deterministic+DALLL2Layer+0",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DAL",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 16,
            "sample_method": "No-Deterministic+DALLL2Layer+16",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DAL",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 32,
            "sample_method": "No-Deterministic+DALLL2Layer+32",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DAL",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 64,
            "sample_method": "No-Deterministic+DALLL2Layer+64",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DAL",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 128,
            "sample_method": "No-Deterministic+DALLL2Layer+128",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "random",
            "sub_acquire_method": "",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 0,
            "sample_method": "No-Deterministic+randommm2Layer+0",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "random",
            "sub_acquire_method": "",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 16,
            "sample_method": "No-Deterministic+randommm2Layer+16",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "random",
            "sub_acquire_method": "",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 32,
            "sample_method": "No-Deterministic+randommm2Layer+32",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "random",
            "sub_acquire_method": "",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 64,
            "sample_method": "No-Deterministic+randommm2Layer+64",
        },
        {
            "model_name": "BiLSTM",
            "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
            "max_performance": 0.80,
            "data_path":
            "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
            "acquire_method": "random",
            "sub_acquire_method": "",
            "unsupervised_method": '',
            "submodular_k": 1.5,
            "num_acquisitions_round": 50,
            "init_question_num": 32,
            "acquire_question_num_per_round": 32,
            "warm_start_random_seed": 128,
            "sample_method": "No-Deterministic+randommm2Layer+128",
        },
    ]
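
The ten entries above differ only in acquire_method, warm_start_random_seed, and sample_method; a small helper (hypothetical, not part of the original script) could generate them and keep the variants in sync:

def make_tasks(acquire_method, sub_method, tag, seeds=(0, 16, 32, 64, 128)):
    # Hypothetical helper: build one task config per warm-start seed.
    base = {
        "model_name": "BiLSTM",
        "group_name": "[tkde]BiLSTM+Pets+MRR+160+160",
        "max_performance": 0.80,
        "data_path": "../../datasets/answer_selection/YahooCQA/data/data-Pets/",
        "acquire_method": acquire_method,
        "sub_acquire_method": sub_method,
        "unsupervised_method": '',
        "submodular_k": 1.5,
        "num_acquisitions_round": 50,
        "init_question_num": 32,
        "acquire_question_num_per_round": 32,
    }
    return [dict(base,
                 warm_start_random_seed=seed,
                 sample_method="No-Deterministic+{}+{}".format(tag, seed))
            for seed in seeds]

# task_seq = (make_tasks("no-dete", "DAL", "DALLL2Layer")
#             + make_tasks("random", "", "randommm2Layer"))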

    allMethods_results = []  # performance results of every method across the active-learning run

    for config in task_seq:

        print("-------------------{}-{}-------------------".format(
            config["group_name"], config["sample_method"]))

        ####################################### initial setting ###########################################
        data_path = config["data_path"]
        model_name = config.get("model_name", 'BiLSTM')
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        init_question_num = config.get("init_question_num", 160)  # number of initial training samples
        acquire_question_num_per_round = config.get("acquire_question_num_per_round", 20)  # number of samples acquired per round
        warm_start_random_seed = config["warm_start_random_seed"]  # random seed for selecting the initial training set
        sample_method = config["sample_method"]

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        os.makedirs(args.result_path, exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name), exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name,
                                 'active_checkpoint', acquire_method),
                    exist_ok=True)

        #### If the data has already been compiled, load it; otherwise compile and cache it
        if (os.path.exists(os.path.join(data_path, 'mappings.pkl'))
                and os.path.exists(os.path.join(data_path, 'train.pkl'))
                and os.path.exists(os.path.join(data_path, 'test.pkl'))):
            with open(os.path.join(data_path, 'mappings.pkl'), 'rb') as f:
                mappings = pkl.load(f)
            with open(os.path.join(data_path, 'train.pkl'), 'rb') as f:
                train_data = pkl.load(f)
            with open(os.path.join(data_path, 'test.pkl'), 'rb') as f:
                test_data = pkl.load(f)
        else:
            train_data, test_data, mappings = loader.load_yahoo(
                data_path, args.pretrained_word_embedding,
                args.word_embedding_dim, args.answer_count)
            with open(os.path.join(data_path, 'mappings.pkl'), 'wb') as f:
                pkl.dump(mappings, f)
            with open(os.path.join(data_path, 'train.pkl'), 'wb') as f:
                pkl.dump(train_data, f)
            with open(os.path.join(data_path, 'test.pkl'), 'wb') as f:
                pkl.dump(test_data, f)

        #word embedding
        word_to_id = mappings['word_to_id']
        tag_to_id = mappings['tag_to_id']
        word_embeds = mappings[
            'word_embeds'] if args.use_pretrained_word_embedding else None

        word_vocab_size = len(word_to_id)

        print('Total training samples: %d' % len(train_data))  # number of question-answer pairs
        print('Total test samples: %d' % len(test_data))

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           answer_count=args.answer_count,
                                           cuda_device=args.device[0],
                                           batch_size=args.sampling_batch_size,
                                           submodular_k=config["submodular_k"])

        checkpoint_path = os.path.join(args.result_path, 'active_checkpoint',
                                       config["group_name"], sample_method)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        method_result = []  # performance after each acquisition round for this method
        ####################################### acquire data and retrain ###########################################
        for i in range(num_acquisitions_round):

            print("current round:{}".format(i))

            #-------------------acquisition---------------------
            if i == 0:  # first round
                acq = init_question_num
                a_m = "random"
                m_p = ""
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')

            acquisition_function.obtain_data(
                train_data,
                model_path=m_p,
                model_name=model_name,
                acquire_questions_num=acq,
                method=a_m,
                sub_method=sub_acquire_method,
                unsupervised_method=config["unsupervised_method"])

            # -------------------prepare training data---------------------
            '''
            Each element of train_data has the format:
            {
                'str_words_q': str_words_q,  # question word segmentation
                'str_words_a': str_words_a,  # answer word segmentation
                'words_q': words_q,  # question word id
                'words_a': words_a,  # answer word id
                'tag': tag,  # sample tag id
            }
            '''

            sorted_train_index = sorted(acquisition_function.train_index)
            labeled_train_data = [train_data[idx] for idx in sorted_train_index]

            print("Labeled training samples: {}".format(
                len(acquisition_function.train_index)))

            # -------------------------------------train--------------------------------------

            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               cuda_device=args.device[0])
            elif model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            cuda_device=args.device[0])
            else:
                raise ValueError('Unsupported model_name: {}'.format(model_name))

            model.cuda(args.device[0])

            trainer = Trainer(model,
                              model_name,
                              tag_to_id,
                              answer_count=args.answer_count,
                              cuda_device=args.device[0],
                              sampling_number=args.sampling_number)

            test_performance = trainer.train_supervisedLearning(
                args.num_epochs,
                labeled_train_data,
                test_data,
                args.learning_rate,
                checkpoint_path=checkpoint_path,
                batch_size=args.batch_size)

            print('.' * 50)
            print("Test performance: {}".format(test_performance))
            print('*' * 50)

            #--------------------------Send data for a visual web page------------------------------
            max_performance = config.get("max_performance", 0)

            if "group_name" in config:
                updateLineChart(str(test_performance),
                                sample_method,
                                gp_name=config["group_name"],
                                max=max_performance)
            else:
                updateLineChart(str(test_performance),
                                sample_method,
                                max=max_performance)

        shutil.rmtree(checkpoint_path)
Example 4
def main(args):

    task_seq = [
        # The config for a task:
        # acquire_method(sub_acquire_method): random(""), no-dete("DAL","BALD"), dete("coreset","entropy",...)
        # "../../datasets/answer_selection/YahooCQA/data/data-FD/"

        {
            "model_name": "CNN",
            "group_name": "[mlabs]KIM+DAL+1e4trn",
            "max_performance": 0.90,
            "data_path": "../../datasets/rcv2/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "BEL",
            "unsupervised_method": 'submodular',
            "submodular_k": 2,
            "num_acquisitions_round": 20,
            "init_question_num": 400,
            "acquire_question_num_per_round": 400,
            "warm_start_random_seed": 0,
            "sample_method": "No-Deterministic+BEL-400*20-800b+0",
        },{
            "model_name": "CNN",
            "group_name": "[mlabs]KIM+DAL+1e4trn",
            "max_performance": 0.90,
            "data_path": "../../datasets/rcv2/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "BEL",
            "unsupervised_method": 'submodular',
            "submodular_k": 2,
            "num_acquisitions_round": 20,
            "init_question_num": 400,
            "acquire_question_num_per_round": 400,
            "warm_start_random_seed": 64,
            "sample_method": "No-Deterministic+BEL-400*20-800b+64",
        },
    ]

    allMethods_results = []  # performance results of every method across the active-learning run

    for config in task_seq:

        print("-------------------{}-{}-------------------".format(config["group_name"], config["sample_method"]))

        ####################################### initial setting ###########################################
        data_path = config["data_path"]
        model_name = config["model_name"] if "model_name" in config else 'CNN'
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        init_question_num = config["init_question_num"] if "init_question_num" in config else 800 # number of initial training samples
        acquire_question_num_per_round = config["acquire_question_num_per_round"] if "acquire_question_num_per_round" in config else 100 #Number of samples collected per round
        warm_start_random_seed = config["warm_start_random_seed"]  # the random seed for selecting the initial training set
        sample_method = config["sample_method"]
        # visual_data_path = os.path.join("result", sample_method + ".txt")

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        os.makedirs(args.result_path, exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name), exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name,
                                 'active_checkpoint', acquire_method),
                    exist_ok=True)

        data = loader.load_rcv2(data_path)

        train_data = data['train_points']
        val_data = data['test_points']

        train_data = train_data[:10000]
        val_data = val_data[:2000]

        #word embedding
        word_embeds = data['embed'] if args.use_pretrained_word_embedding else None

        word_vocab_size = len(data['vocab'][1])

        print('Total training samples: %d' % len(train_data))  # number of question-answer pairs
        print('Total validation samples: %d' % len(val_data))  # val_data also serves as the test set here

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           cuda_device=args.device[0],
                                           batch_size=args.sampling_batch_size,
                                           submodular_k=config["submodular_k"])

        checkpoint_path = os.path.join(args.result_path, 'active_checkpoint', config["group_name"], sample_method)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)

        method_result = []  # performance after each acquisition round for this method
        ####################################### acquire data and retrain ###########################################
        for i in range(num_acquisitions_round):

            print("current round:{}".format(i))

            #-------------------acquisition---------------------
            if i == 0:  # first round
                acq = init_question_num
                a_m = "random"
                m_p = ""
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')

            acquisition_function.obtain_data(train_data,
                                             model_path=m_p,
                                             model_name=model_name,
                                             acquire_num=acq,
                                             method=a_m,
                                             sub_method=sub_acquire_method,
                                             unsupervised_method=config["unsupervised_method"],
                                             round=i)

            # -------------------prepare training data---------------------
            '''
            Each element of train_data has the format:
            {
                'str_words_q': str_words_q,  # question word segmentation
                'str_words_a': str_words_a,  # answer word segmentation
                'words_q': words_q,  # question word id
                'words_a': words_a,  # answer word id
                'tag': tag,  # sample tag id
            }
            '''

            sorted_train_index = sorted(acquisition_function.train_index)
            labeled_train_data = [train_data[idx] for idx in sorted_train_index]

            print("Labeled training samples: {}".format(len(acquisition_function.train_index)))

            # -------------------------------------train--------------------------------------


            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               cuda_device=args.device[0])
            elif model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            cuda_device=args.device[0])
            else:
                raise ValueError('Unsupported model_name: {}'.format(model_name))

            model.cuda(args.device[0])

            trainer = Trainer(model,
                              args.result_path,
                              model_name,
                              eval_begin=1,
                              cuda_device=args.device[0],
                              top_k=args.top_k)

            test_performance = trainer.train_supervisedLearning(args.num_epochs,
                                                                labeled_train_data,
                                                                val_data,
                                                                args.learning_rate,
                                                                checkpoint_path=checkpoint_path,
                                                                batch_size=args.batch_size
                                                                )

            print('.' * 50)
            print("Test performance: {}".format(test_performance))
            print('*' * 50)

            #--------------------------Send data for a visual web page------------------------------
            max_performance = config["max_performance"] if "max_performance" in config else 0

            if "group_name" in config:
                updateLineChart(str(test_performance), sample_method, gp_name=config["group_name"], max=max_performance)
            else:
                updateLineChart(str(test_performance), sample_method, max=max_performance)

            method_result.append(test_performance)

        print("acquire_method: {},sub_acquire_method: {}, warm_start_random_seed{}"
              .format(acquire_method, sub_acquire_method, warm_start_random_seed))
        print(method_result)

        allMethods_results.append(method_result)
        shutil.rmtree(checkpoint_path)
        with open(config["group_name"]+sample_method.split('+')[1].split('-')[0]+"_detail.pkl",'wb') as f:
            pkl.dump(acquisition_function.savedData, f)
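Example 5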
def main(args):

    task_seq = [

        # The config for a task:
        # acquire_method(sub_acquire_method): random(""), no-dete("DASL","DAL","BALD"), dete("coreset","entropy",...)
        {
            "model_name": "BiLSTM",
            "group_name": "[2.18-?]BiLSTM+FD+MRR+200+200",
            "max_performance": 0.80,
            "data_path": "data/YahooCQA/data-FD/",
            "acquire_method": "no-dete",
            "sub_acquire_method": "DASL",
            "num_acquisitions_round": 37,
            "init_question_num": 40,
            "acquire_question_num_per_round": 40,
            "warm_start_random_seed": 16,
            "sample_method": "No-Deterministic+DASL2+seed16",
        },
    ]

    allMethods_results = []  # performance results of every method across the active-learning run

    for config in task_seq:

        print("-----------------------{}-{}-----------------------".format(
            config["group_name"], config["sample_method"]))

        ####################################### initial setting ###########################################
        data_path = config.get("data_path", "data/YahooCQA/data-FD/")
        model_name = config.get("model_name", 'CNN')
        num_acquisitions_round = config["num_acquisitions_round"]
        acquire_method = config["acquire_method"]
        sub_acquire_method = config["sub_acquire_method"]
        init_question_num = config.get("init_question_num", 160)  # number of initial training samples
        acquire_question_num_per_round = config.get("acquire_question_num_per_round", 20)  # number of samples acquired per round
        warm_start_random_seed = config["warm_start_random_seed"]  # random seed for selecting the initial training set
        sample_method = config["sample_method"]

        loader = Loader()

        print('model:', model_name)
        print('dataset:', data_path)
        print('acquisition method:', acquire_method, "+", sub_acquire_method)

        os.makedirs(args.result_path, exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name), exist_ok=True)
        os.makedirs(os.path.join(args.result_path, model_name,
                                 'active_checkpoint', acquire_method),
                    exist_ok=True)

        #### If the data has already been compiled, load it; otherwise compile and cache it
        if (os.path.exists(os.path.join(data_path, 'mappings.pkl'))
                and os.path.exists(os.path.join(data_path, 'train.pkl'))
                and os.path.exists(os.path.join(data_path, 'val.pkl'))
                and os.path.exists(os.path.join(data_path, 'test.pkl'))):
            with open(os.path.join(data_path, 'mappings.pkl'), 'rb') as f:
                mappings = pkl.load(f)
            with open(os.path.join(data_path, 'train.pkl'), 'rb') as f:
                train_data = pkl.load(f)
            with open(os.path.join(data_path, 'val.pkl'), 'rb') as f:
                val_data = pkl.load(f)
            with open(os.path.join(data_path, 'test.pkl'), 'rb') as f:
                test_data = pkl.load(f)
        else:
            train_data, val_data, test_data, mappings = loader.load_yahoo(
                data_path, args.pretrained_word_embedding,
                args.word_embedding_dim, args.answer_count)
            with open(os.path.join(data_path, 'train.pkl'), 'wb') as f:
                pkl.dump(train_data, f)
            with open(os.path.join(data_path, 'val.pkl'), 'wb') as f:
                pkl.dump(val_data, f)
            with open(os.path.join(data_path, 'test.pkl'), 'wb') as f:
                pkl.dump(test_data, f)
            with open(os.path.join(data_path, 'mappings.pkl'), 'wb') as f:
                pkl.dump(mappings, f)

        #word embedding
        word_to_id = mappings['word_to_id']
        tag_to_id = mappings['tag_to_id']
        word_embeds = mappings[
            'word_embeds'] if args.use_pretrained_word_embedding else None

        word_vocab_size = len(word_to_id)

        total_sentences = len(train_data)  # total number of question-answer pairs

        print('Training data loaded: %d samples in total' % total_sentences)

        acquisition_function = Acquisition(train_data,
                                           seed=warm_start_random_seed,
                                           answer_count=args.answer_count)

        method_result = []  # performance after each acquisition round for this method
        ####################################### acquire data and retrain ###########################################
        for i in range(num_acquisitions_round):

            print("current round:{}".format(i))

            #-------------------acquisition---------------------
            if i == 0:  # first round
                acq = init_question_num
                a_m = "random"
                m_p = ""
                acquisition_function.obtain_data(train_data,
                                                 model_path="",
                                                 acquire=init_question_num,
                                                 method="random")
            else:
                acq = acquire_question_num_per_round
                a_m = acquire_method
                m_p = os.path.join(checkpoint_path, 'modelweights')
                acquisition_function.obtain_data(model_path=m_p,
                                                 model_name=model_name,
                                                 data=train_data,
                                                 acquire=acq,
                                                 method=a_m,
                                                 sub_method=sub_acquire_method)

            # -------------------prepare training data---------------------
            '''
            Each element of train_data has the format:
            {
                'str_words_q': str_words_q,  # question word segmentation
                'str_words_a': str_words_a,  # answer word segmentation
                'words_q': words_q,  # question word id
                'words_a': words_a,  # answer word id
                'tag': tag,  # sample tag id
            }
            '''

            sorted_train_index = sorted(acquisition_function.train_index)
            labeled_train_data = [train_data[idx] for idx in sorted_train_index]

            active_train_data = dict()
            active_train_data['labeled_train_data'] = labeled_train_data
            active_train_data[
                'pseudo_train_data'] = acquisition_function.pseudo_train_data

            print("Labeled training samples: {}".format(
                len(acquisition_function.train_index)))
            print("Unlabeled sample remaining: {}".format(
                len(train_data) - len(acquisition_function.train_index)))

            # -------------------------------------train--------------------------------------
            checkpoint_folder = os.path.join('active_checkpoint',
                                             acquire_method, "fixed")
            checkpoint_path = os.path.join(args.result_path, model_name,
                                           checkpoint_folder)
            if not os.path.exists(checkpoint_path):
                os.makedirs(checkpoint_path)

            print('.............Recreate the model...................')
            if model_name == 'BiLSTM':
                model = BiLSTM(word_vocab_size,
                               args.word_embedding_dim,
                               args.word_hidden_dim,
                               args.target_size,
                               pretrained=word_embeds,
                               with_sim_features=args.with_sim_feature,
                               double_embedding=args.double_embedding)
            elif model_name == 'CNN':
                model = CNN(word_vocab_size,
                            args.word_embedding_dim,
                            args.word_out_channels,
                            args.target_size,
                            pretrained=word_embeds,
                            double_embedding=args.double_embedding)
            else:
                raise ValueError('Unsupported model_name: {}'.format(model_name))

            model.cuda()

            trainer = Trainer(model,
                              args.result_path,
                              model_name,
                              tag_to_id,
                              answer_count=args.answer_count)

            if active_train_data['pseudo_train_data']:

                noActiveTrain = {
                    "acquisition_function": acquisition_function,
                    "model_path": m_p,
                    "model_name": model_name,
                    "train_data": train_data,
                    "acquire": acq,
                    "method": a_m,
                    "sub_method": sub_acquire_method
                }

                test_performance = trainer.train_selfPacedLearning(
                    noActiveTrain,
                    args.num_epochs,
                    active_train_data,
                    val_data,
                    test_data,
                    args.mu,
                    args.learning_rate,
                    checkpoint_folder=checkpoint_folder,
                    batch_size=args.batch_size)
            else:
                test_performance = trainer.train_supervisedLearning(
                    args.num_epochs,
                    active_train_data,
                    val_data,
                    test_data,
                    args.learning_rate,
                    checkpoint_folder=checkpoint_folder,
                    batch_size=args.batch_size)

            print('*' * 50)
            print("Test performance: {}".format(test_performance))
            print('-' * 80)

            #--------------------------Send data for a visual web page------------------------------
            max_performance = config.get("max_performance", 0)

            if "group_name" in config:
                updateLineChart(str(test_performance),
                                sample_method,
                                gp_name=config["group_name"],
                                max=max_performance)
            else:
                updateLineChart(str(test_performance),
                                sample_method,
                                max=max_performance)
Example 6
np.random.seed(0)
random.seed(0)


def theLoss(x, target):
    # Multi-label classification loss; sigmoid is applied to the logits internally.
    return nn.MultiLabelSoftMarginLoss()(x, target)
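
nn.MultiLabelSoftMarginLoss treats each output column as an independent binary label, averaging -[y * log sigmoid(x) + (1 - y) * log sigmoid(-x)] over labels and batch. A quick sanity check on toy inputs:

# All-zero logits give sigmoid(0) = 0.5 for every label, so the loss is
# -log(0.5) ≈ 0.6931 regardless of the target:
# theLoss(torch.zeros(2, 103), torch.zeros(2, 103))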


eurlex_path = r"../../datasets/eurLex/"
rcv2_path = r"../../datasets/rcv2/"

loader = Loader()
data = loader.load_rcv2(datapath=rcv2_path, vocab_size=30000)

train_data = data['train_points']
val_data = data['test_points']

train_data = train_data[:8000]
val_data = val_data[:2000]  # cap the validation set size

model = CNN(word_vocab_size=30000,
            word_embedding_dim=300,
            word_out_channels=200,
            output_size=103,
            pretrained=data['embed'])
model = nn.DataParallel(model).cuda()
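
A minimal training-step sketch tying these pieces together. It assumes, purely for illustration, that each element of train_data exposes padded token ids under 'words' and a 103-dimensional multi-hot label vector under 'tag', and that the model maps a (batch, seq_len) id tensor to (batch, 103) logits; adjust the field names to the real data layout:

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
batch = train_data[:32]
words = torch.LongTensor([p['words'] for p in batch]).cuda()   # hypothetical field
target = torch.FloatTensor([p['tag'] for p in batch]).cuda()   # hypothetical field
logits = model(words)                                          # (32, 103)
loss = theLoss(logits, target)                                 # MultiLabelSoftMarginLoss
optimizer.zero_grad()
loss.backward()
optimizer.step()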