Example #1
def generate_query_embeddings(model, instances_list):
    batch_size = 100

    embeddings_list = list()

    print('Computing embeddings for %s questions' % len(instances_list))
    # Group the iterator into fixed-size batches; this zip idiom drops
    # any remainder, which is embedded separately after the loop.
    slices = zip(*(iter(instances_list), ) * batch_size)
    num_batches = len(instances_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))

        question_batch = [question_dict["question"] for question_dict in s]
        encodings = model.signatures['question_encoder'](
            tf.constant(question_batch))
        for i in range(len(question_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    # Embed the final partial batch, if any.
    if num_batches * batch_size < len(instances_list):
        question_batch = [
            question_dict["question"]
            for question_dict in instances_list[num_batches * batch_size:]
        ]
        encodings = model.signatures['question_encoder'](
            tf.constant(question_batch))
        for i in range(len(question_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    return np.array(embeddings_list)
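
The zip(*(iter(...),) * batch_size) grouping idiom above is easy to misread; a minimal standalone sketch of its behavior (plain Python, nothing here is Colab-specific):

# Zipping the *same* iterator with itself batch_size times yields
# fixed-size tuples and silently drops the remainder -- hence the
# explicit tail handling after the loop above.
items = list(range(7))
batch_size = 3
batches = list(zip(*(iter(items),) * batch_size))
print(batches)                             # [(0, 1, 2), (3, 4, 5)]
print(items[len(batches) * batch_size:])   # [6] -- the leftover tail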
Example #2
 def _active_component(self, component_id):
     """Sets active subcomponent."""
     if not self._published:
         self._publish()
     if self._current_component is not None:
         raise WidgetException('Already inside a component')
     self._current_component = component_id
     _util.flush_all()
     with self._output_in_widget():
         with output.use_tags(self._current_component):
             with output.redirect_to_element('#' + component_id):
                 with output.use_tags('user_output'):
                     try:
                         yield
                     finally:
                         _util.flush_all()
                         self._current_component = None
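
The generator above is presumably wrapped with contextlib.contextmanager in the original class (note the yield). A minimal sketch of the same tagged-output pattern outside any widget class, assuming a Colab runtime:

import contextlib
from google.colab import output  # Colab-only module

@contextlib.contextmanager
def tagged(tag):
    # Output printed inside the block is associated with `tag`, so it can
    # later be removed with output.clear(output_tags=tag).
    with output.use_tags(tag):
        yield

with tagged('demo'):
    print('replaceable status line')
output.clear(output_tags='demo')  # removes everything printed under the tag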
Example #3
def run_predictions(eval_set, out_filename, n_examples=None, padding=False):
  """
  Automatic prediction script with options for checkpointing, error coverage,
  padding of incomplete experiments and filedumping.
  """
  predictions = dict()
  checkpoints = list(range(1000, 11000, 1000))  # save a checkpoint every 1000 examples
  error_count = 0

  if not n_examples:
    for i, example in enumerate(eval_set):
      try:
        predictions[example.qas_id] = answer_question(example.question_text, example.context_text)
      except Exception:
        predictions[example.qas_id] = ""
        error_count += 1
      output.clear(output_tags='status_text')
      with output.use_tags('status_text'):
        print(f"Loaded {i+1}/{len(eval_set)}; failed predictions: {error_count}")
      if i in checkpoints:
        with open('/content/gdrive/My Drive/squad/' + str(i // 1000) + 'k_' + out_filename, 'w') as fp:
          json.dump(predictions, fp)
        print(f"Saved checkpoint {i // 1000}k")
  else:
    for i in range(n_examples):
      example = eval_set[i]
      predictions[example.qas_id] = answer_question(example.question_text, example.context_text)
      output.clear(output_tags='status_text')
      with output.use_tags('status_text'):
        print(f"Loaded {i+1}/{n_examples}")
        
    if padding:
      for i in range(n_examples, len(eval_set), 1):
        example = eval_set[i]
        predictions[example.qas_id] = ""
        output.clear(output_tags='status_text')
        with output.use_tags('status_text'):
          print(f"Loaded {i+1}/{len(eval_set)}")

  output.clear(output_tags='status_text')

  with open('/content/gdrive/My Drive/squad/'+out_filename, 'w') as fp:
      json.dump(predictions, fp)
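
A hypothetical invocation, assuming eval_examples is a list of SQuAD-style objects exposing the qas_id, question_text and context_text attributes the loop reads:

# Predict the first 500 examples, pad the rest with empty answers, and
# write the result next to the checkpoints on Drive.
run_predictions(eval_examples, 'dev_predictions.json', n_examples=500, padding=True)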
Example #4
def generate_document_embeddings(model, response_list, sent_list, doc_list):
    '''
    :param model: the USE-QA model
    :param response_list: a list of (sent_id, doc_id) tuples
    :param sent_list: a list of sentence strings
    :param doc_list: a list of document strings
    :return: a NumPy array with one embedding per response
    '''

    batch_size = 100

    embeddings_list = list()

    print('Computing embeddings for %s sentences' % len(response_list))
    # Same fixed-size batching idiom as in Example #1; the remainder is
    # handled after the loop.
    slices = zip(*(iter(response_list), ) * batch_size)
    num_batches = len(response_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))

        response_batch = [sent_list[int(sent_id)] for sent_id, doc_id in s]
        context_batch = [doc_list[int(doc_id)] for sent_id, doc_id in s]
        encodings = model.signatures['response_encoder'](
            input=tf.constant(response_batch),
            context=tf.constant(context_batch))
        for i in range(len(response_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    # Embed the final partial batch, if any.
    if batch_size * num_batches < len(response_list):
        response_batch = [
            sent_list[int(sent_id)]
            for sent_id, doc_id in response_list[num_batches * batch_size:]
        ]
        context_batch = [
            doc_list[int(doc_id)]
            for sent_id, doc_id in response_list[num_batches * batch_size:]
        ]
        encodings = model.signatures['response_encoder'](
            input=tf.constant(response_batch),
            context=tf.constant(context_batch))
        for i in range(len(response_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    return np.array(embeddings_list)
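
For context, a hedged sketch of how such a model is typically obtained and how the two encoders pair up; the module handle is the one cited in Example #5's comment:

import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

# Load the USE-QA module from TF Hub.
model = hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')

q = model.signatures['question_encoder'](tf.constant(['When was it built?']))
r = model.signatures['response_encoder'](
    input=tf.constant(['It was built in 1931.']),
    context=tf.constant(['It was built in 1931. It remains in use today.']))
# Question/response relevance is the inner product of the embeddings.
scores = np.inner(q['outputs'], r['outputs'])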
Example #5
def generate_document_embeddings_no_context(model, response_list):
    '''
    :param model: the USE-QA model
    :param response_list: a list of strings
    :return: a NumPy array with one embedding per sentence
    '''

    batch_size = 100

    embeddings_list = list()

    print('Computing embeddings for %s sentences' % len(response_list))
    slices = zip(*(iter(response_list), ) * batch_size)
    num_batches = len(response_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))

        # Per https://tfhub.dev/google/universal-sentence-encoder-qa/3, the
        # answer itself should be passed as the context when there is no
        # surrounding document.
        answer_batch = list(s)
        encodings = model.signatures['response_encoder'](
            input=tf.constant(answer_batch), context=tf.constant(answer_batch))
        for i in range(len(answer_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    # Embed the final partial batch, if any.
    if batch_size * num_batches < len(response_list):
        answer_batch = list(response_list[num_batches * batch_size:])
        encodings = model.signatures['response_encoder'](
            input=tf.constant(answer_batch), context=tf.constant(answer_batch))
        for i in range(len(answer_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))

    return np.array(embeddings_list)
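
A hypothetical call, reusing a model loaded as in the sketch after Example #4; the 512-dimensional output is the USE-QA module's documented embedding size:

sentences = ['The bridge opened in 1932.', 'It spans the harbour.']
vectors = generate_document_embeddings_no_context(model, sentences)
print(vectors.shape)  # (2, 512)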
Example #6
 def _output_in_widget(self):
     with output.use_tags(self._output_tags):
         try:
             yield
         finally:
             _util.flush_all()
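
Again a generator presumably decorated with contextlib.contextmanager. A hypothetical minimal host class (the names below are illustrative, not Colab's) showing the call shape:

import contextlib
from google.colab import output

class _StatusBox:  # hypothetical stand-in for the real widget class
    _output_tags = ['status_box']

    @contextlib.contextmanager
    def _output_in_widget(self):
        with output.use_tags(self._output_tags):
            yield  # the original also flushes pending output on exit

box = _StatusBox()
with box._output_in_widget():
    print('rendered under the widget tag')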
Example #7
         # Prompt appearing here means something went wrong with the drive binary.
         d.kill(_signal.SIGKILL)
         extra_reason = ''
         if 0 == _subprocess.call('grep -q "{}" "{}"'.format(
                 timeout_pattern, dfs_log),
                                  shell=True):
             extra_reason = (
                 ': timeout during initial read of root folder; for more info: '
                 'https://research.google.com/colaboratory/faq.html#drive-timeout'
             )
         raise ValueError('mount failed' + extra_reason)
     elif case == 2:
         # Not already authorized, so do the authorization dance.
         auth_prompt = d.match.group(
             1) + '\nEnter your authorization code:\n'
         with _output.use_tags('dfs-auth-dance'):
             with open(fifo, 'w') as fifo_file:
                 fifo_file.write(get_code(auth_prompt) + '\n')
         wrote_to_fifo = True
     elif case == 5:
         raise ValueError('mount failed: invalid oauth code')
 if not wrote_to_fifo:
     with open(fifo, 'w') as fifo_file:
         fifo_file.write('ignored\n')
 filtered_logfile = _timeouts_path()
 d.sendline('fuser -kw "{f}" ; rm -rf "{f}"'.format(f=filtered_logfile))
 d.expect(prompt)
 filter_script = _os.path.join(drive_dir, 'drive-filter.py')
 filter_cmd = (
     """nohup bash -c 'tail -n +0 -F "{}" | """
     """python3 {} > "{}" ' < /dev/null > /dev/null 2>&1 &""").format(
Example #8
 def display(self):
     with output.use_tags([self._tag]):
         display(self)  # the module-level IPython display function, not this method
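
A hypothetical standalone version of the same pattern, with the class and tag names invented for illustration:

from IPython.display import display as ipy_display
from google.colab import output  # Colab-only

class TaggedBanner:  # hypothetical example class
    def __init__(self, tag, text):
        self._tag = tag
        self._text = text

    def _repr_html_(self):
        return '<b>' + self._text + '</b>'

    def display(self):
        # Tag the rendered output so it can be cleared or replaced later.
        with output.use_tags([self._tag]):
            ipy_display(self)

banner = TaggedBanner('banner', 'building index...')
banner.display()
output.clear(output_tags='banner')  # removes the banner again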
Example #9
def main():

    window_size = 6
    batch_size = 16
    model_name = 'skipgram-win12'
    ckpt_dir = '/content/gdrive/My Drive/files/{0}/{0}'.format(model_name)
    ckpt_dir = ckpt_dir + '-ep{}.ckpt'

    text, dictionary = util.nltk2data(brown, save_dict=False, remove_punc=True)
    train_inps, train_labels, eval_inps, eval_labels = util.data_dicer(
        text,
        dictionary,
        window_size,
        batch_size,
        chopoff=True,
        train_eval_split=0.8)
    # TODO: placeholders for input and output (label)
    vocab_size = len(dictionary.keys())
    embed_dim = 512
    num_sampled = 10
    num_true = 2 * window_size
    epochs = 1
    current_epoch = 1

    x_ph = tf.placeholder(tf.int32, shape=(None, ), name='x_ph')
    target_ph = tf.placeholder(tf.int32,
                               shape=(None, num_true),
                               name='target_ph')

    # TODO: construct embedding_layers USING VARIABLE SCOPE!
    with tf.variable_scope('skipgram', reuse=tf.AUTO_REUSE):
        embedding_layer = tf.get_variable('embedding_layer',
                                          shape=(vocab_size, embed_dim),
                                          dtype=tf.float32,
                                          initializer=tf.constant_initializer(
                                              np.random.randn(
                                                  vocab_size, embed_dim)))

        output_weights = tf.get_variable('output_weight',
                                         shape=(vocab_size, embed_dim),
                                         dtype=tf.float32,
                                         initializer=tf.constant_initializer(
                                             np.random.randn(
                                                 vocab_size, embed_dim)))

        output_biases = tf.get_variable('output_bias',
                                        shape=(vocab_size, ),
                                        dtype=tf.float32,
                                        initializer=tf.zeros_initializer())

    # TODO: map the word using tf.nn.lookup
    # center_word shape = [batch_size, embed_dim]
    center_word = tf.nn.embedding_lookup(embedding_layer, x_ph)

    # TODO: Check which mode, either training or eval
    # TODO: Training, calculate NCE loss
    train_loss = tf.nn.nce_loss(weights=output_weights,
                                biases=output_biases,
                                labels=target_ph,
                                inputs=center_word,
                                num_true=num_true,
                                num_sampled=num_sampled,
                                num_classes=vocab_size)
    train_batch_loss = tf.reduce_mean(train_loss)
    opt = tf.train.AdamOptimizer().minimize(train_batch_loss)

    # TODO: eval, calculate manually? refers to tensorflow guide.
    # output_weights.shape = [vocab_size, dim]
    # center_word.shape = [batch_size, dim]
    # matmul, output_weights

    # projections = tf.matmul(center_word, tf.transpose(output_weights)) + output_biases
    # eval_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=projections, labels=tf.squeeze(target_ph))
    # eval_batch_loss = tf.reduce_mean(eval_loss)

    saver = tf.train.Saver(tf.trainable_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if current_epoch > 1:
            saver.restore(sess, ckpt_dir.format(current_epoch - 1))
        start_time = time.time()
        print("=" * 80)
        print("=" * 80)
        print("Start time: {}".format(
            time.strftime('%Y/%m/%d, %H:%M%S', time.localtime(start_time))))
        print("=" * 80)
        print("=" * 80)
        for i in range(epochs):
            epoch_start_time = time.time()
            print("Epoch: ", i + 1)

            epoch_loss = []
            # Do it over batches
            # 1. stats
            batch_number = 0
            seconds_per_batch = 0
            total_batch = len(train_inps)
            for batch_inps, batch_labels in zip(train_inps, train_labels):
                batch_start_time = time.time()
                batch_number = batch_number + 1
                left_over_time = round(
                    (total_batch - batch_number) * seconds_per_batch, 3)
                output.clear(output_tags='batch_print', wait=True)
                with output.use_tags('batch_print'):
                    print(
                        "Current batch: {}/{}\tSeconds per batch: {}s, {}s left."
                        .format(batch_number, total_batch,
                                seconds_per_batch, left_over_time))

                feed_dict = {x_ph: batch_inps, target_ph: batch_labels}
                _, batch_loss_v = sess.run([opt, train_batch_loss],
                                           feed_dict=feed_dict)
                epoch_loss.append(batch_loss_v)
                batch_end_time = time.time()
                seconds_per_batch = round(batch_end_time - batch_start_time, 3)
                # TODO: averaging epoch_loss
            feed_dict = {x_ph: eval_inps, target_ph: eval_labels}
            [eval_loss_v] = sess.run([train_batch_loss], feed_dict=feed_dict)

            epoch_loss = np.mean(epoch_loss)
            # TODO: print train loss, print eval loss
            epoch_end_time = time.time()
            epoch_duration = epoch_end_time - epoch_start_time
            print("Epochs took: {}s".format(round(epoch_duration, 3)))
            print("Train Loss: {}\tEval Loss: {}".format(
                round(epoch_loss, 3), round(eval_loss_v, 3)))

        end_time = time.time()
        duration = end_time - start_time
        print("=" * 80)
        print("=" * 80)
        print("End time: {}".format(
            time.strftime('%Y/%m/%d, %H:%M%S', time.localtime(end_time))))
        print("Duration: {}s".format(round(duration, 2)))
        print("=" * 80)
        print("=" * 80)
        saver.save(sess, ckpt_dir.format(current_epoch))
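
The clear-then-reprint pattern inside the batch loop is the same one Examples #1 and #3 use. Distilled into a helper (Colab-only, assuming google.colab.output; the helper name is ours):

from google.colab import output

def report_progress(tag, message):
    # Replace whatever was previously printed under `tag` with `message`,
    # producing a single self-updating status line.
    output.clear(output_tags=tag, wait=True)
    with output.use_tags(tag):
        print(message)

for step in range(1, 4):
    report_progress('batch_print', 'Current batch: {}/3'.format(step))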
Example #10
import logging
import os

import tensorflow as tf
from .preprocessing import load_conversations, tokenize_and_filter
from .inference import predict_greedy, predict_beam
from .model import transformer, CustomSchedule, loss_function, accuracy
from .params import *
from ..components import save_obj, load_obj, make_tokenizer, train

logging.basicConfig(level=logging.INFO)
tf.random.set_seed(42)
tf.keras.backend.clear_session()

IS_TPU = False
if os.environ.get('IS_COLAB', False):
    from google.colab import output
    try:
        with output.use_tags('setup'):
            #  !pip install convokit
            #  !python3 -m spacy download en
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
            print('Running on TPU ',
                  tpu.cluster_spec().as_dict()['worker'])
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
            IS_TPU = True
        output.clear(output_tags='setup')

    except ValueError:
        logging.info('Not connected to a TPU runtime')
Example #11
            if val > 1 - SKIP_PERCANTAGE:
                continue

    # Get the tokens and remove words that are too short
    tokens = []
    try:
        tokens = row['cleaned_Data_Content'].split(' ')
        tokens = [token for token in tokens if len(token) > 3]
    except Exception:
        print('A book failed to tokenize')

    books.append(tokens)
    genres.append(genre)

    output.clear(output_tags='status_text')
    with output.use_tags('status_text'):
        print('Books loaded: ' + str(count))
    count += 1

genres = np.array(genres)
books = np.array(books)

# Additional data processing to optimize the topics generated

commonWords = []

with open("/content/drive/My Drive/ATIML/common.txt") as common:
    for word in common:
        commonWords.append(word[:-1])

forbidden = [
    # (list contents truncated in the source)
]
Example #12
# The snippet begins mid-function; a hedged reconstruction of the visible
# tail, assuming a helper that takes `string` and `cmap` as parameters.
from IPython.display import display, HTML

def colorized_header(string, cmap):
    html_str = """
    <h1 id='colorized' class='font-effect-3d'>""" + string + """</h1>
    <script>
    var tc=setInterval(function(){
        var now=new Date().getTime();
        var iddoc=document.getElementById('colorized');
        iddoc.style.color=d3.interpolate""" + cmap + """((now%60000)/60000);},1)
    </script>"""
    display(HTML(html_str))
# %cmap_header CODE MODULES & HELPFUL TOOLS|24|Ewert|Turbo

"""#✒️ Tagged Outputs"""

import sys, time
from google.colab import output  # used by the tagged-outputs demo below

print('The process is starting')
with output.use_tags('tagged_outputs'):
    for el in ['working \n', '=> => => \n', 'still working \n']:
        sys.stdout.write(el)  # write each message, not a fixed string
        sys.stdout.flush(); time.sleep(5)
output.clear(output_tags='tagged_outputs')
print('Outputs have cleared')

"""#✒️ Linked Outputs"""

# Commented out IPython magic to ensure Python compatibility.
# %%javascript
# const listener=new BroadcastChannel('channel1');
# listener.onmessage=(msg)=>{
#   const div=document.createElement('div');
#   div.textContent=msg.data;
#   div.style.border='double white';