Example #1
0
    def process(self, data_loaders, outputs):
        """Train the initial NMT model on the training data and emit it.

        Parameters
        ----------
        data_loaders : BEAT data loaders; ``data_loaders[0][0]`` yields
            ``(data, _, end_data_index)`` carrying the pickled training
            data and the source/target vocabularies.
        outputs : BEAT outputs; the trained model (torch.save output
            converted to a 1D uint8 array) is written to ``outputs['model']``.

        Returns
        -------
        bool
            Always True, signalling BEAT to continue processing.
        """
        beat_logger.debug("############### mt_train_initial_model")
        dl = data_loaders[0]
        (data, _, end_data_index) = dl[0]

        # SECURITY NOTE: pickle.loads on transported data — only safe if
        # the BEAT platform guarantees the payload is trusted.
        data_dict = pickle.loads(data["train_data"].text.encode("latin1"))
        # Split the training documents into train/valid portions, mirroring
        # the split performed later during lifelong adaptation.
        self.data_dict_train, self.data_dict_dev = beat_separate_train_valid(
            data_dict)

        # Build the option structure expected by Options.from_dict.
        self.params['data'] = {
            'train_set': {'src': self.data_dict_train['src'],
                          'trg': self.data_dict_train['trg']},
            'val_set': {'src': self.data_dict_dev['src'],
                        'trg': self.data_dict_dev['trg']},
        }
        self.params['vocabulary'] = {
            'src': data['source_vocabulary'].text,
            'trg': data['target_vocabulary'].text,
        }
        self.params['filename'] = '/not/needed/beat_platform'
        self.params['sections'] = ['train', 'model', 'data', 'vocabulary']
        opts = Options.from_dict(self.params, {})

        setup_experiment(opts, beat_platform=True)
        dev_mgr = DeviceManager("gpu")

        # If a seed is given, use it; otherwise generate one and store it in
        # the options so it is included in the reproducibility log below.
        if opts.train['seed'] > 0:
            fix_seed(opts.train['seed'])
        else:
            opts.train['seed'] = fix_seed()

        # Instantiate the model class named by opts.train['model_type'].
        model = getattr(models, opts.train['model_type'])(opts=opts,
                                                          beat_platform=True)

        beat_logger.info(
            "Python {} -- torch {} with CUDA {} (on machine '{}')".format(
                platform.python_version(), torch.__version__,
                torch.version.cuda, platform.node()))
        beat_logger.info("nmtpytorch {}".format(nmtpytorch.__version__))
        beat_logger.info(dev_mgr)
        beat_logger.info("Seed for further reproducibility: {}".format(
            opts.train['seed']))

        # Run the training loop; calling it returns the serialized model.
        loop = MainLoop(model, opts.train, dev_mgr, beat_platform=True)
        model = loop()

        # The model is Pickled with torch.save() and converted into a 1D-array of uint8
        # Pass the model to the next block
        outputs['model'].write({'value': model}, end_data_index)

        beat_logger.debug("############## End of mt_train_model ############")
        return True
Example #2
0
def unsupervised_model_adaptation(source,
                  file_id,
                  data_dict,
                  model,
                  translator,
                  current_hypothesis):
    """Placeholder for unsupervised adaptation of the translation model.

    No adaptation is performed yet: ``model`` and ``translator`` are
    returned untouched.  The body only demonstrates how the original
    training material can be re-read while processing lifelong data.
    """
    #beat_logger.debug("### mt_lifelong_loop:unsupervised_model_adaptation")

    ########################################################
    # ACCESS TRAINING DATA TO ADAPT IF WANTED
    ########################################################
    # The training data remains accessible throughout the lifelong run via
    # data_dict.  The helper below re-creates the same train/valid partition
    # that was used during the initial training (document-level information
    # is lost in the process).
    train_split, valid_split = beat_separate_train_valid(data_dict)

    ########################################################
    # Example: accessing a training file by INDEX
    #
    # The commented lines show how source text and file_info of a training
    # file could be fetched through a data loader, given its index.
    ########################################################
    sample_index = 2  # index of the file we want to access
    #file_id_per_index = train_loader[sample_index][0]['train_file_info'].file_id
    #time_stamp_per_index = train_loader[sample_index][0]['train_file_info'].time_stamp
    #supervision_per_index = train_loader[sample_index][0]['train_file_info'].supervision
    #source_per_index = train_loader[sample_index][0]["train_source"].value

    ########################################################
    # Example: accessing a training data per file_id
    #
    # Source text and file_info of a training file can also be looked up
    # directly in data_dict by its file_id.
    ########################################################
    # Pick a known file_id (here simply the third key of data_dict).
    known_file_id = list(data_dict.keys())[2]
    file_info = data_dict[known_file_id]["file_info"]
    stamp_for_id = file_info['time_stamp']
    supervision_for_id = file_info['supervision']
    source_for_id = data_dict[known_file_id]["source"]

    # TODO: Update the model and the translator object

    return model, translator
Example #3
0
    def process(self, inputs, data_loaders, outputs, loop_channel):
        #print("### mt_lifelong_loop:process start -- that's great!")
        """Translate one incoming document, optionally adapting the model.

        Lazily restores the initially-trained model and training data on
        first call, translates the incoming document, then — depending on
        the supervision mode — interacts with the (simulated) human to
        adapt the model and re-translate.  The final hypothesis is written
        to ``outputs["hypothesis"]``.

        Parameters
        ----------
        inputs : BEAT inputs carrying the lifelong source text and file info.
        data_loaders : BEAT data loaders; loader 0 provides the trained
            model and the pickled training data.
        outputs : BEAT outputs; the translation goes to "hypothesis".
        loop_channel : channel used to exchange a request/answer with the
            user simulation during active learning.

        Returns
        -------
        bool
            Always True, signalling BEAT to continue processing.
        """
        ########################################################
        # RECEIVE INCOMING INFORMATION FROM THE FILE TO PROCESS
        ########################################################

        # Access source text of the current file to process
        source = inputs["processor_lifelong_source"].data.text

        # recreate the translation model and train/dev data
        # (done lazily: only on the first call, then cached on self)
        if self.model is None or self.data_dict_train is None:
            # Get the model after initial training
            dl = data_loaders[0]
            (data, _, end_index) = dl[0]

            # Store the baseline model
            # (re-packed from a uint8 sequence into raw bytes so it can be
            # handed to the Translator below)
            if self.model is None:
                model_data = data['model'].value
                self.model = struct.pack('{}B'.format(len(model_data)),
                                         *list(model_data))

            if self.data_dict_train is None:
                # SECURITY NOTE: pickle.loads on transported data — only
                # safe if the BEAT platform payload is trusted.
                data_dict = pickle.loads(
                    data["processor_train_data"].text.encode("latin1"))
                self.data_dict_train, self.data_dict_dev = beat_separate_train_valid(
                    data_dict)

        # Create a baseline Translator object from nmtpy
        self.translate_params['models'] = [self.model]
        self.translate_params['source'] = source
        translator = Translator(beat_platform=True, **self.translate_params)

        # train sentence vectors for data selection
        # (computed once on the first call, then cached on self)
        if self.train_sen_vecs is None:
            #Get the vocab from the opts of the model
            self.src_vocab = json.loads(
                translator.instances[0].opts['vocabulary']['src'])
            #Get the embeddings from the model's weights
            self.word_embs = translator.instances[0].enc.emb.weight
            # Create the sentence embeddings for the training data
            self.train_sen_vecs = get_sen_vecs(self.data_dict_train,
                                               self.src_vocab, self.word_embs)

        # Access incoming file information
        # See documentation for a detailed description of the mt_file_info
        file_info = inputs["processor_lifelong_file_info"].data
        file_id = file_info.file_id
        supervision = file_info.supervision
        time_stamp = file_info.time_stamp

        # NOTE(review): hard-coded, machine-specific dump path — should be
        # made configurable before running anywhere else.
        path_llnmt = '/home/barrault/msc/lifelongmt/'
        for p in ('original', 'adapted'):
            if not os.path.exists(path_llnmt + '{}'.format(p)):
                os.mkdir(path_llnmt + '{}'.format(p))

        # Per-document files where the baseline ("original") and adapted
        # translations are dumped for inspection.
        original_file = path_llnmt + 'original/{}'.format(file_id)
        adapted_file = path_llnmt + 'adapted/{}'.format(file_id)

        beat_logger.debug(
            "mt_lifelong_loop::process: received document {} ({} sentences) to translate "
            .format(file_id, len(source)))
        #beat_logger.debug('mt_lifelong_loop::process: source = {}'.format(source))

        #TODO: prepare train/valid data for fine-tuning (eventually) -- might not be needed actually as the data is already contained in the training params -> this can be huge!
        # Baseline translation with the unadapted model.
        current_hypothesis = run_translation(translator, source, file_id)
        with open(original_file, 'w') as f:
            for s in current_hypothesis:
                f.write(s)
                f.write('\n')
        # If human assisted learning is ON
        ###################################################################################################
        # Interact with the human if necessary
        # This section exchange information with the user simulation and ends up with a new hypothesis
        ###################################################################################################
        human_assisted_learning = supervision in ["active", "interactive"]
        # code not used!!
        if not human_assisted_learning:
            # In this method, see how to access initial training data to adapt the model
            # for the new incoming data
            # NOTE(review): `self.train_data` is not set anywhere in this
            # view (the rest of the code uses self.data_dict_train); if this
            # branch ever executes it will likely raise AttributeError —
            # confirm the intended attribute and fix.
            self.adapted_model, adapted_translator = unsupervised_model_adaptation(
                source, file_id, self.train_data, self.model, translator,
                current_hypothesis)
            # update current_hypothesis with current model
            current_hypothesis = run_translation(adapted_translator, source,
                                                 file_id)

        # If human assisted learning mode is on (active or interactive learning)
        # NOTE: both branches below clear the flag, so this loop runs at
        # most one iteration.
        while human_assisted_learning:

            # Create an empty request that is used to initiate interactive learning
            # For the case of active learning, this request is overwritten by your system itself
            # For now, only requests of type 'reference' are allowed (i.e. give me the reference translation for sentence 'sentence_id' of file 'file_id')

            if supervision == "active":
                # The system can send a question to the human in the loop
                # by using an object of type request
                # The request is the question asked to the system
                request = generate_system_request_to_user(
                    file_id, source, current_hypothesis, self.qe_model)

                # Send the request to the user and wait for the answer
                message_to_user = {
                    "file_id":
                    file_id,  # ID of the file the question is related to
                    "hypothesis": current_hypothesis[
                        request['sentence_id']],  # The current hypothesis
                    "system_request":
                    request,  # the question for the human in the loop
                }
                #beat_logger.debug("mt_lifelong_loop::process: send message to user: request={}".format(request))
                human_assisted_learning, user_answer = loop_channel.validate(
                    message_to_user)

                # Take into account the user answer to generate a new hypothesis and possibly update the model
                adapted_model_data = online_adaptation(
                    self.model, self.translate_params, file_id,
                    request['sentence_id'], user_answer, source,
                    current_hypothesis, self.data_dict_train,
                    self.train_sen_vecs, self.src_vocab, self.word_embs)

                # Update the translator object with the current model
                # (same uint8-sequence -> raw bytes repacking as above)
                self.adapted_model = struct.pack(
                    '{}B'.format(len(adapted_model_data)),
                    *list(adapted_model_data))
                self.adapted_translate_params['models'] = [self.adapted_model]
                self.adapted_translate_params['source'] = source
                adapted_translator = Translator(
                    beat_platform=True, **self.adapted_translate_params)

                # Generate a new translation
                new_hypothesis = run_translation(adapted_translator, source,
                                                 file_id)
                # NOTE: let's debug by simply using the previous translation
                #new_hypothesis = current_hypothesis

                # Dump the adapted translation for inspection.
                with open(adapted_file, 'w') as fad:
                    for s in new_hypothesis:
                        fad.write(s)
                        fad.write('\n')

                #beat_logger.debug("BEFORE online_adaptation: {}".format(current_hypothesis))
                #beat_logger.debug("AFTER online_adaptation : {}".format(new_hypothesis))
                # Update the current hypothesis with the new one
                current_hypothesis = new_hypothesis
                human_assisted_learning = False

            else:
                # "interactive" (or any other) supervision mode: no
                # adaptation implemented — just leave the loop.
                human_assisted_learning = False

        # End of human assisted learning
        # Send the current hypothesis
        #    self.init_end_index = 0
        #beat_logger.debug("HYPOTHESIS: {}".format(current_hypothesis))
        print("mt_lifelong_loop::process: FINISHED translated document {}: ".
              format(file_id))
        outputs["hypothesis"].write(mt_to_allies(current_hypothesis))

        # Deliberate no-op: nothing special to do once the stream is drained.
        if not inputs.hasMoreData():
            pass
        # always return True, it signals BEAT to continue processing
        return True