Ejemplo n.º 1
0
    def __init__(self, url, tokenizer, model_step):
        """Prepare per-run validation state backed by a validation database.

        Args:
          url: Handed unchanged to ``validation_database.ValidationDatabase``.
          tokenizer: Kept on the instance as ``self.tokenizer``.
          model_step: Kept on the instance as ``self.model_step``.
        """
        self.tokenizer = tokenizer

        database = validation_database.ValidationDatabase(url)
        self.val_db = database
        self.val_files = {}
        # The starting id is whatever ``count`` the database reports right now.
        self.val_id = database.count
        self.model_step = model_step

        # Two-element counters, both starting at zero.
        # NOTE(review): the meaning of each slot is not visible here — confirm
        # against the code that updates them.
        self.mask_accuracy = [0, 0]
        self.nsp_accuracy = [0, 0]
Ejemplo n.º 2
0
 def _writeValidation(self, result, tf_set) -> None:
     """Insert or update the stored validation results for ``tf_set``.

     The results are serialised as one ``"<key>: <value>"`` line per entry of
     ``result`` and written to the ``ValResults`` row whose ``key`` equals
     ``str(tf_set)``. The row is created when it does not exist yet.

     Args:
       result: Mapping of metric name to value; each value is passed
         through ``str``.
       tf_set: Identifier of the validation set; ``str(tf_set)`` is the row
         key.
     """
     db = validation_database.ValidationDatabase("sqlite:///{}".format(
         str(self.logfile_path / "validation_samples.db")))
     row_key = str(tf_set)
     serialized = "\n".join(
         "{}: {}".format(name, str(value)) for name, value in result.items())
     with db.Session(commit=True) as session:
         # One lookup both answers "does the row exist?" and returns the row
         # to update, instead of an existence query followed by a fetch.
         entry = session.query(
             validation_database.ValResults).filter_by(key=row_key).first()
         if entry is None:
             session.add(
                 validation_database.ValResults(key=row_key,
                                                results=serialized))
         else:
             entry.results = serialized
Ejemplo n.º 3
0
def parseValidationDB(db_path):
    """Summarise a validation database file, tolerating any failure.

    Args:
      db_path: Path-like object exposing ``exists()`` (e.g. ``pathlib.Path``)
        that points at the sqlite validation database.

    Returns:
      A dict with the keys ``'val_sample_count'``, ``'path'``,
      ``'val_metrics'`` and ``'val_samples'``. When the file exists and can
      be opened, ``'path'`` holds its ``sqlite:///`` URL and
      ``'val_sample_count'`` holds the database's ``count``. Otherwise they
      stay at ``None`` and ``-1``. The two list entries are always returned
      empty by this function.
    """
    validation_db = {
        'val_sample_count': -1,
        'path': None,
        'val_metrics': [],
        'val_samples': [],
    }
    try:
        if db_path.exists():
            validation_db['path'] = "sqlite:///{}".format(db_path)
            val_db = validation_database.ValidationDatabase(
                validation_db['path'], must_exist=True)
            validation_db['val_sample_count'] = val_db.count
    except Exception:
        # Best-effort: an unreadable or malformed database is reported as
        # "no validation data". Catching ``Exception`` rather than using a
        # bare ``except`` lets KeyboardInterrupt/SystemExit propagate.
        validation_db['val_sample_count'] = -1
        validation_db['path'] = None
    return validation_db
Ejemplo n.º 4
0
  def __init__(self, mode, url, tokenizer, seen_in_training, original_input,
               input_ids, input_mask, masked_lm_positions, masked_lm_ids,
               masked_lm_weights, masked_lm_lengths, next_sentence_labels,
               masked_lm_predictions, next_sentence_predictions):
    """
    Set up a writeValidationDB instance.

    Opens the validation database at ``sqlite:///<url>`` and keeps references
    to the tensors of a single validation step so they can be stored later.

    Args:
      mode: Forwarded to the parent class initializer.
      url: Location of the sqlite database; prefixed with ``sqlite:///``.
      tokenizer: Stored as ``self.tokenizer``.
      All remaining arguments are the input and output tensors of one
      validation step; each is stored on the instance under its own name.
    """
    super(writeValidationDB, self).__init__(mode)

    self.tokenizer = tokenizer

    db_url = "sqlite:///{}".format(url)
    self.val_db = validation_database.ValidationDatabase(db_url)
    # The starting id is whatever ``count`` the database reports right now.
    self.val_id = self.val_db.count

    # Inputs of the validation step.
    self.seen_in_training = seen_in_training
    self.original_input = original_input
    self.input_ids = input_ids
    self.input_mask = input_mask

    # Masked-LM and next-sentence targets.
    self.masked_lm_positions = masked_lm_positions
    self.masked_lm_ids = masked_lm_ids
    self.masked_lm_weights = masked_lm_weights
    self.masked_lm_lengths = masked_lm_lengths
    self.next_sentence_labels = next_sentence_labels

    # Model outputs.
    self.masked_lm_predictions = masked_lm_predictions
    self.next_sentence_predictions = next_sentence_predictions
Ejemplo n.º 5
0
def validation_samples(workspace: str, model_sha: str):
    """Render the validation-samples page for one cached model.

    The model is looked up in ``cached_models`` under the sha256 of
    ``str(workspace) + model_sha``. When its validation entry has a database
    path, all ``BERTValFile`` samples and ``ValResults`` metrics are loaded,
    and every sample's ``input_ids`` is replaced by a list of display
    segments (dicts with ``'text'``, ``'color'`` and, for hole samples,
    ``'length'``).

    Note that ``validation`` is the dict stored inside the cached model, so
    the keys written here persist in the cache.

    Args:
      workspace: Workspace identifier; part of the cache key.
      model_sha: Model hash; part of the cache key.

    Returns:
      The result of rendering ``validation_samples.html``.

    Raises:
      KeyError: Presumably, when the model is not in ``cached_models``
        (assuming it is a plain dict — confirm).
    """
    global data
    global cached_models
    if data == {}:
        data = parseData()

    target_sha = crypto.sha256_str(str(workspace) + model_sha)
    current_model = cached_models[target_sha]
    validation = current_model['validation']

    if validation['path']:
        val_db = validation_database.ValidationDatabase(
            str(validation['path']), must_exist=True)
        with val_db.Session() as session:
            validation['val_samples'] = session.query(
                validation_database.BERTValFile).all()
            validation['val_metrics'] = session.query(
                validation_database.ValResults).all()

        for sample in validation['val_samples']:
            if '[HOLE]' in sample.input_ids:
                segments = _hole_sample_segments(sample)
            elif '[MASK]' in sample.input_ids:
                segments = [{
                    'text': sample.input_ids,
                    'color': 'plain',
                }]
            else:
                segments = []
            sample.input_ids = segments

    validation['summary'] = current_model['summary']
    validation['workspace'] = workspace
    validation['model_sha'] = model_sha
    return flask.render_template("validation_samples.html",
                                 data=validation,
                                 **GetBaseTemplateArgs())


def _hole_sample_segments(sample):
    """Build the display segments for a sample whose text contains '[HOLE]'.

    For each of the first ``sample.num_targets`` holes this emits the plain
    text before the hole, the hole marker, the prediction and the target.
    Any text pieces left after those holes are appended as plain segments.
    """
    hole = '[HOLE]'
    pieces = sample.input_ids.split(hole)
    mask_num = sample.num_targets

    segments = []
    if mask_num > 0:
        # Split once up front rather than once per hole.
        lengths = sample.masked_lm_lengths.split(',')
        predictions = sample.masked_lm_predictions.split('\n')
        targets = sample.masked_lm_ids.split('\n')
        for i in range(mask_num):
            segments += [
                {
                    'text': pieces[i],
                    'color': 'plain',
                    'length': len(pieces[i]),
                },
                {
                    'text': hole,
                    'color': 'hole',
                    'length': int(lengths[i]),
                },
                {
                    'text':
                    predictions[i].replace(' ', '[ ]').replace('\n', '\\n'),
                    'color': 'prediction',
                    'length': 1,
                },
                {
                    'text':
                    targets[i].replace(' ', '[ ]').replace('\n', '\\n'),
                    'color': 'target',
                    'length': 1,
                },
            ]

    # Trailing pieces are taken by slice, not by reusing the loop index: with
    # zero targets that index would be unbound or left over from a previous
    # sample.
    for piece in pieces[max(mask_num, 0):]:
        segments.append({
            'text': piece,
            'color': 'plain',
            'length': len(piece),
        })
    return segments