Example #1
    def train_model(self):
        self.__begin()
        log_debug(self.final_model_path,
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        log_debug(os.path.exists(self.final_model_path),
                  token_id=self.training_token,
                  prefix=self.common_prefix)

        should_train = not os.path.exists(self.final_model_path)

        has_error = self.__train() if should_train else self.__retrain(
            self.final_model_path)

        # `and` short-circuits, so __attach_to_networks() only runs when training reported no error
        if not has_error and self.__attach_to_networks():
            message = 'Dataset: {0} training successful'.format(
                self.dataset_name)
        else:
            message = 'Dataset: {0} training failed'.format(self.dataset_name)

        log_error(message,
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        self.__tear_down()
        return has_error, message
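For context, the guard in train_model leans on Python's short-circuiting `and`: once training has reported an error, __attach_to_networks() is never invoked. A minimal standalone sketch (the attach() function below is a hypothetical stand-in):

    def attach():
        print('attaching')  # only reached when there was no error
        return True

    has_error = True
    print(not has_error and attach())   # False; attach() is never called

    has_error = False
    print(not has_error and attach())   # prints 'attaching', then True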
Example #2
    def __begin(self):
        log_info('\\' * 80,
                 token_id=self.training_token,
                 prefix=self.common_prefix)
        log_info(
            "Training Begin: Dataset={3}, prefix=multistep_{0}, SetType={1},TrainDocs={2}"
            .format(self.common_prefix, self.increment_type,
                    self.training_document_threshold, self.dataset_name),
            token_id=self.training_token,
            prefix=self.common_prefix)
        log_info('/' * 80,
                 token_id=self.training_token,
                 prefix=self.common_prefix)

        log_debug('-' * 80,
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        log_debug('Training config',
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        log_debug(self._config,
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        log_debug('-' * 80,
                  token_id=self.training_token,
                  prefix=self.common_prefix)
        log_debug('__begin complete',
                  token_id=self.training_token,
                  prefix=self.common_prefix)
Example #3
def parse_course_department(soup):
    header = get_header_text(soup)
    # The header is expected to look like "<title> (<name>)": everything from
    # the opening parenthesis is the name, the rest is the title.
    name = header[header.find("("):]
    title = header[:-len(name)].rstrip()
    name = name[1:-1]
    log_debug(header)
    return {"title": title, "name": name}
Example #4
def parse_degree(soup):
    header = get_header_text(soup)
    # The header is expected to look like "<subject>, <level>": the text after
    # the last comma is the level, the rest is the subject.
    level = header.split(",")[-1]
    subject = header[:-len(level) - 1]
    level = level.lstrip()
    log_debug(header)
    school, department = parse_school_department(soup)
    degree_requirements = parse_requirements(soup)
    return {
        "name": header,
        "subject": subject,
        "level": level,
        "school": school,
        "department": department,
        "requirements": degree_requirements
    }
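The same splitting applied to a made-up degree header of the assumed "<subject>, <level>" form:

    header = "Computer Science, BS"

    level = header.split(",")[-1]        # " BS"
    subject = header[:-len(level) - 1]   # "Computer Science"
    level = level.lstrip()               # "BS"

    print({"subject": subject, "level": level})
    # {'subject': 'Computer Science', 'level': 'BS'}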
Example #5
    def __can_train(self):
        log_debug('Training data set file options',
                  token_id=self.training_token,
                  prefix=self.prefix)
        log_debug(self.train_file_options,
                  token_id=self.training_token,
                  prefix=self.prefix)
        new_files = generate_train_file(self.train_file_options)

        # log_info('{0} training candidate files found'.format(new_files), token_id=self.training_token, prefix=self.prefix)

        can_proceed_to_training = new_files >= self.train_file_options[
            'training_document_threshold']

        if can_proceed_to_training:

            message = "Wrote/appended {0} new files to {1}/train.txt\nAnd logged them in {2}.".format(
                new_files, self.train_file_options['train_document_path'],
                self.train_file_options['train_history_file'])
            log_debug(msg=message,
                      token_id=self.training_token,
                      prefix=self.prefix)
        else:
            message = 'Not enough documents; need {0} more.'.format(
                self.train_file_options['training_document_threshold'] -
                new_files)
            log_warning(msg="Cannot train. " + message,
                        token_id=self.training_token,
                        prefix=self.prefix)

        return can_proceed_to_training, message
Example #6
    def __attach_to_networks(self):
        try:
            new_model_path = self.train_stats['finalModel']
            log_debug('Before Update',
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_debug('networks id : {0}'.format(id(self.network_dict)),
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            replace_network = self.dataset_name in self.network_dict
            if replace_network:
                del self.network_dict[self.dataset_name]
                gc.collect()

            self.network_dict.update({self.dataset_name: self.trained_model})

            new_model_file = os.path.basename(new_model_path)
            dest_folder = os.path.dirname(self.final_model_path)

            shutil.copy2(new_model_path, dest_folder)

            log_debug(self.network_dict,
                      token_id=self.training_token,
                      prefix=self.common_prefix)

            if replace_network:
                # Swap the freshly copied model into place: back up the live
                # file, move the copy onto its path, then drop the backup.
                os.rename(self.final_model_path,
                          self.final_model_path[:-3] + '.bak')
                os.rename(dest_folder + '/' + new_model_file,
                          self.final_model_path)
                os.remove(self.final_model_path[:-3] + '.bak')
            else:
                os.rename(dest_folder + '/' + new_model_file,
                          self.final_model_path)
            return True

        except Exception:
            log_error('Network replacement failed.',
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_error(traceback.format_exc(),
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            return False
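A minimal, self-contained sketch of the copy-then-swap pattern above, run on throwaway temp files (all paths and contents here are made up; the real method swaps self.final_model_path, and its backup naming strips the extension rather than appending to it):

    import os
    import shutil
    import tempfile

    workdir = tempfile.mkdtemp()
    final_model_path = os.path.join(workdir, 'model.h5')
    new_model_path = os.path.join(workdir, 'staging', 'model_new.h5')

    os.makedirs(os.path.dirname(new_model_path))
    with open(final_model_path, 'w') as f:
        f.write('old model')
    with open(new_model_path, 'w') as f:
        f.write('new model')

    # Copy the new model next to the live one, back the live file up,
    # move the copy onto its path, then discard the backup.
    dest_folder = os.path.dirname(final_model_path)
    shutil.copy2(new_model_path, dest_folder)
    copied = os.path.join(dest_folder, os.path.basename(new_model_path))

    os.rename(final_model_path, final_model_path + '.bak')
    os.rename(copied, final_model_path)
    os.remove(final_model_path + '.bak')

    with open(final_model_path) as f:
        print(f.read())  # -> new model
    shutil.rmtree(workdir)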
Example #7
    def __retrain(self, model_path):
        has_error = False

        try:
            log_info("============== Re-training {0} ==============".format(
                self.training_token),
                     token_id=self.training_token,
                     prefix=self.common_prefix)

            model = BiLSTM(params=self.params,
                           fn_log_info=log_info,
                           fn_log_debug=log_debug,
                           training_token=self.training_token,
                           training_prefix=self.common_prefix)

            log_debug("Loading model %s" % model_path,
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            model.loadModel(model_path, "")
            log_debug("Loaded model %s" % model_path,
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            # os.remove(model_path)
        except Exception:
            log_error('Re-training: model loading failed.',
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_error(traceback.format_exc(),
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            has_error = True
        else:

            try:
                log_debug(model.mappings.keys())
                with DataGenerator(dataset_name=self.dataset_name,
                                   mappings=model.mappings,
                                   cols=self.data_columns) as generator:
                    log_debug("Train Sentences: %d" %
                              len(generator.data['trainMatrix']))
                    log_debug("Dev Sentences: %d" %
                              len(generator.data['devMatrix']))
                    log_debug("Test Sentences: %d" %
                              len(generator.data['testMatrix']))

                    model.setTrainDataset(generator.data, self.label_key)

                model.modelSavePath = self.transient_model_path

                self.train_stats = model.evaluate()
                log_debug("%s" % self.train_stats,
                          token_id=self.training_token,
                          prefix=self.common_prefix)
                self.trained_model = model
            except Exception:
                log_error('Re-training: failed.',
                          token_id=self.training_token,
                          prefix=self.common_prefix)
                log_error(traceback.format_exc(),
                          token_id=self.training_token,
                          prefix=self.common_prefix)
                has_error = True
            finally:
                gc.collect()
        finally:
            # Returning from finally guarantees a has_error result even if one
            # of the handlers above raised while logging.
            return has_error
Example #8
    def __train(self):
        has_error = False
        try:
            log_debug("Dataset: %s" % self.dataset_name,
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_debug("Label key: %s" % self.label_key,
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_info("============== Training: {0} ==============".format(
                self.training_token),
                     token_id=self.training_token,
                     prefix=self.common_prefix)

            ######################################################
            #
            # The training of the network starts here
            #
            ######################################################

            model = BiLSTM(params=self.params,
                           fn_log_info=log_info,
                           fn_log_debug=log_debug,
                           training_token=self.training_token,
                           training_prefix=self.common_prefix)
            with EmbeddingsAndDataGenerator(
                    embeddings_path=self.embeddings_path,
                    dataset_file=self.dataset_files,
                    reuse_embedding=self.reuse_embeddings) as generator:
                log_debug(generator.data['mappings'].keys(),
                          token_id=self.training_token,
                          prefix=self.common_prefix)
                log_info("Train Sentences: %d" %
                         len(generator.data['trainMatrix']),
                         token_id=self.training_token,
                         prefix=self.common_prefix)
                log_info("Dev Sentences: %d" %
                         len(generator.data['devMatrix']),
                         token_id=self.training_token,
                         prefix=self.common_prefix)
                log_info("Test Sentences: %d" %
                         len(generator.data['testMatrix']),
                         token_id=self.training_token,
                         prefix=self.common_prefix)
                model.setMappings(generator.embeddings,
                                  generator.data['mappings'])
                model.setTrainDataset(generator.data, self.label_key)

            model.verboseBuild = True
            model.modelSavePath = self.transient_model_path
            self.train_stats = model.evaluate()
            log_debug("%s" % self.train_stats,
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            # del word2Idx
            self.trained_model = model
        except Exception:
            log_error('Training: failed.',
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            log_error(traceback.format_exc(),
                      token_id=self.training_token,
                      prefix=self.common_prefix)
            has_error = True
        finally:
            gc.collect()

        return has_error