def __init__(self,
             masked_lm_prob=0.15,
             mask_token_prob=0.80,
             resample_token_prob=0.50,
             deterministic_n_lm_tokens=True,
             **kwargs):
  super().__init__(**kwargs)
  self._masked_lm_prob = masked_lm_prob
  self._mask_token_prob = mask_token_prob
  self._resample_token_prob = resample_token_prob
  self._deterministic_n_lm_tokens = deterministic_n_lm_tokens
  # tf.where-based "branching" for BERT's Cloze task.
  self._branch1_sampler = (
      tfp.distributions.Uniform() if self._deterministic_n_lm_tokens
      else tfp.distributions.Bernoulli(
          probs=self._masked_lm_prob, dtype=tf.bool))
  self._branch2_sampler = tfp.distributions.Bernoulli(
      probs=self._mask_token_prob, dtype=tf.bool)
  self._branch3_sampler = tfp.distributions.Bernoulli(
      probs=self._resample_token_prob, dtype=tf.bool)
  # Resample (integer-valued) tokens uniformly at random, ignoring any
  # special tokens in the vocabulary.
  self._resample_sampler = vocabulary.Sampler(self._vocab)

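# Hedged sketch (the transform's actual call method is not part of this
# excerpt) of how the branch samplers above can drive BERT-style corruption
# with tf.where. The mask code and the position-selection logic below are
# assumptions for illustration only.
mask_code = 1                                    # assumed code of the MASK token.
tokens = vocabulary.Sampler().sample((64,))      # a fake input sequence.
# Branch 1: select ~15% of positions for the LM loss. In the deterministic
# variant, ranking Uniform noise yields exactly round(0.15 * 64) positions.
noise = tfp.distributions.Uniform().sample((64,))
n_lm = int(round(0.15 * 64))
lm_positions = tf.argsort(tf.argsort(noise)) < n_lm  # bool mask, n_lm True.
# Branches 2 and 3: 80% MASK; of the remainder, 50% random token, 50% keep.
use_mask = tfp.distributions.Bernoulli(probs=0.8, dtype=tf.bool).sample((64,))
use_rand = tfp.distributions.Bernoulli(probs=0.5, dtype=tf.bool).sample((64,))
random_tokens = vocabulary.Sampler().sample((64,))
corrupted = tf.where(
    use_mask,
    tf.fill((64,), tf.cast(mask_code, tokens.dtype)),  # replace by MASK.
    tf.where(use_rand, random_tokens, tokens))          # resample or keep.
masked_inputs = tf.where(lm_positions, corrupted, tokens)
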
def make_fake_sequence_dataset(num_examples=1000):
  voc = vocabulary.alternative
  sampler = vocabulary.Sampler(voc)
  ds = tf.data.Dataset.from_tensor_slices({
      'sequence': sampler.sample((num_examples, 128)),
      'seq_key': tf.range(num_examples, dtype=tf.int32),
      'fam_key': tf.range(num_examples, 2 * num_examples, dtype=tf.int32),
  })
  return ds

def make_fake_homology_dataset(num_examples=1000, seq_len=128):
  voc = vocabulary.proteins
  sampler = vocabulary.Sampler(voc)
  return tf.data.Dataset.from_tensor_slices({
      'sequence': sampler.sample((num_examples, seq_len)),
      'target': tf.random.uniform(shape=(num_examples,)) > 0.8,
      'weights': tf.ones(shape=(num_examples, seq_len), dtype=tf.float32),
  })

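# Hedged usage sketch for the fake-dataset builders above: batch a few
# examples and inspect the fields (nothing beyond the builders is assumed).
fake_ds = make_fake_homology_dataset(num_examples=8, seq_len=16).batch(4)
for batch in fake_ds.take(1):
  print(batch['sequence'].shape)   # (4, 16), integer residue codes.
  print(batch['target'].dtype)     # tf.bool, True for roughly 20% of examples.
  print(batch['weights'].shape)    # (4, 16), all ones.
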
def __init__(self,
             max_len=512,
             len_increase_ratio=2.0,
             logits=None,
             gap_token='-',
             **kwargs):
  super().__init__(**kwargs)
  self._max_len = max_len
  self._len_increase_ratio = len_increase_ratio
  self._sampler = vocabulary.Sampler(
      vocab=self._vocab,
      logits=self.PFAM_LOGITS if logits is None else logits)
  self._gap_token = gap_token
  self._gap_code = self._vocab.get(self._gap_token)

def __init__(self, max_len=512, tau=0.01, alpha=0.05, eta=0.7, vocab=None):
  self._max_len = max_len
  vocab = vocabulary.get_default() if vocab is None else vocab
  self._sampler = vocabulary.Sampler(vocab=vocab)
  self._eos = vocab.get(vocab.specials[-1])
  self._pad = vocab.padding_code
  # Transition look-up table (excluding special initial transition).
  look_up = {
      (self.MATCH, self.MATCH): 1,
      (self.GAP_IN_X, self.MATCH): 2,
      (self.GAP_IN_Y, self.MATCH): 3,
      (self.MATCH, self.GAP_IN_X): 4,
      (self.GAP_IN_X, self.GAP_IN_X): 5,
      (self.GAP_IN_Y, self.GAP_IN_X): 9,  # "Forbidden" transition.
      (self.MATCH, self.GAP_IN_Y): 6,
      (self.GAP_IN_X, self.GAP_IN_Y): 7,
      (self.GAP_IN_Y, self.GAP_IN_Y): 8,
  }
  # Builds data structures for efficiently encoding transitions.
  self._hash_fn = lambda d0, d1: 3 * (d1 + 1) + (d0 + 1)
  hashes = [self._hash_fn(d0, d1) for (d0, d1) in look_up]
  trans_encoder = tf.scatter_nd(
      indices=[[x] for x in hashes],
      updates=list(look_up.values()),
      shape=[max(hashes) + 1])
  self._trans_encoder = tf.cast(trans_encoder, tf.int32)
  self._init_trans = tf.convert_to_tensor([self.INIT_TRANS], dtype=tf.int32)
  cond_probs = tf.convert_to_tensor(
      [[0.0, 1.0, 0.0, 0.0, 0.0],
       [0.0, 1.0 - 2.0 * alpha - tau, alpha, alpha, tau],
       [0.0, eta, 1.0 - eta - alpha, alpha, 0.0],
       [0.0, eta, 0.0, 1.0 - eta, 0.0],
       [0.0, 0.0, 0.0, 0.0, 1.0]], tf.float32)
  self._logits = tf.where(cond_probs > 0.0, tf.math.log(cond_probs), -np.inf)
  self._delta_len_x = tf.convert_to_tensor([0, 1, 0, 1, 0])
  self._delta_len_y = tf.convert_to_tensor([0, 1, 1, 0, 0])

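# Hedged illustration of the transition encoding built above: a pair of
# consecutive alignment states (prev, curr) is hashed as 3 * (curr + 1) +
# (prev + 1), and the hash is decoded into one of the nine transition codes
# via the scatter_nd look-up table. The concrete state codes below
# (MATCH=0, GAP_IN_X=1, GAP_IN_Y=2) are assumptions for illustration only.
MATCH, GAP_IN_X, GAP_IN_Y = 0, 1, 2
hash_fn = lambda d0, d1: 3 * (d1 + 1) + (d0 + 1)
look_up = {
    (MATCH, MATCH): 1, (GAP_IN_X, MATCH): 2, (GAP_IN_Y, MATCH): 3,
    (MATCH, GAP_IN_X): 4, (GAP_IN_X, GAP_IN_X): 5, (GAP_IN_Y, GAP_IN_X): 9,
    (MATCH, GAP_IN_Y): 6, (GAP_IN_X, GAP_IN_Y): 7, (GAP_IN_Y, GAP_IN_Y): 8,
}
hashes = [hash_fn(d0, d1) for (d0, d1) in look_up]
trans_encoder = tf.scatter_nd(
    indices=[[h] for h in hashes],
    updates=list(look_up.values()),
    shape=[max(hashes) + 1])
# Encoding a toy state path MATCH -> MATCH -> GAP_IN_X -> MATCH:
states = tf.constant([MATCH, MATCH, GAP_IN_X, MATCH])
pair_hashes = 3 * (states[1:] + 1) + (states[:-1] + 1)
transitions = tf.gather(trans_encoder, pair_hashes)  # -> [1, 4, 2]
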
def make_fake_dataset(num_examples=1000):
  voc = vocabulary.proteins
  sampler = vocabulary.Sampler(voc)
  ds = tf.data.Dataset.from_tensor_slices(sampler.sample((num_examples, 128)))
  return ds.map(lambda x: {'sequence': x})

def setUp(self):
  super().setUp()
  gin.clear_config()
  tf.random.set_seed(0)
  self.sampler = vocabulary.Sampler()
  self.seq = self.sampler.sample((256,))

def setUp(self):
  super().setUp()
  tf.random.set_seed(0)
  self.vocab = vocabulary.alternative
  self.sampler = vocabulary.Sampler(vocab=self.vocab)

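# Hedged sketch of a test method built on such a fixture; the shapes and
# assertions below are illustrative only.
def test_sampler_output(self):
  batch = self.sampler.sample((4, 32))
  self.assertEqual(batch.shape, (4, 32))
  self.assertTrue(bool(tf.reduce_all(batch >= 0)))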