def generate_query_embeddings(model, instances_list):
    """Encodes each question in `instances_list` with the USE-QA question encoder."""
    batch_size = 100
    embeddings_list = []
    print('Computing embeddings for %s questions' % len(instances_list))
    # zip(*(iter(x),) * n) yields only full batches of n; the remainder is handled below.
    slices = zip(*(iter(instances_list), ) * batch_size)
    num_batches = len(instances_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))
        question_batch = [question_dict["question"] for question_dict in s]
        encodings = model.signatures['question_encoder'](
            tf.constant(question_batch))
        for i in range(len(question_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    # Encode the final partial batch, if any.
    if num_batches * batch_size < len(instances_list):
        question_batch = [
            question_dict["question"]
            for question_dict in instances_list[num_batches * batch_size:]
        ]
        encodings = model.signatures['question_encoder'](
            tf.constant(question_batch))
        for i in range(len(question_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    return np.array(embeddings_list)
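# A minimal usage sketch for the encoder above (the toy question is an
# assumption for illustration; the module URL is the public USE-QA v3 model,
# and `google.colab.output` is available when running in Colab):
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from google.colab import output

use_qa = hub.load('https://tfhub.dev/google/universal-sentence-encoder-qa/3')
query_embeddings = generate_query_embeddings(
    use_qa, [{"question": "What is the capital of France?"}])
print(query_embeddings.shape)  # (1, 512): USE-QA produces 512-dim vectors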
@contextlib.contextmanager  # generator method used via `with`; assumes `import contextlib`
def _active_component(self, component_id):
    """Sets active subcomponent."""
    if not self._published:
        self._publish()
    if self._current_component is not None:
        raise WidgetException('Already inside a component')
    self._current_component = component_id
    _util.flush_all()
    with self._output_in_widget():
        with output.use_tags(self._current_component):
            with output.redirect_to_element('#' + component_id):
                with output.use_tags('user_output'):
                    try:
                        yield
                    finally:
                        _util.flush_all()
                        self._current_component = None
def run_predictions(eval_set, out_filename, n_examples=None, padding=False):
    """Automatic prediction script with options for checkpointing, error
    coverage, padding of incomplete experiments and file dumping."""
    predictions = dict()
    checkpoints = []
    error_count = 0
    # Checkpoint every 1000 examples (disabled; uncomment to enable).
    for k in range(1000, 11000, 1000):
        # checkpoints.append(k)
        pass
    if not n_examples:
        for i, example in enumerate(eval_set):
            try:
                predictions[example.qas_id] = answer_question(
                    example.question_text, example.context_text)
            except Exception:
                predictions[example.qas_id] = ""
                error_count += 1
            output.clear(output_tags='status_text')
            with output.use_tags('status_text'):
                print(f"Loaded {i+1}/{len(eval_set)} Failed predictions: {error_count}")
            if i in checkpoints:
                with open('/content/gdrive/My Drive/squad/' + str(int(i / 1000)) +
                          'k_' + out_filename, 'w') as fp:
                    json.dump(predictions, fp)
                print(f"Saved checkpoint {i/1000}k")
    else:
        for i in range(n_examples):
            example = eval_set[i]
            predictions[example.qas_id] = answer_question(
                example.question_text, example.context_text)
            output.clear(output_tags='status_text')
            with output.use_tags('status_text'):
                print(f"Loaded {i+1}/{n_examples}")
        if padding:
            # Fill the remaining examples with empty answers.
            for i in range(n_examples, len(eval_set)):
                example = eval_set[i]
                predictions[example.qas_id] = ""
                output.clear(output_tags='status_text')
                with output.use_tags('status_text'):
                    print(f"Loaded {i+1}/{len(eval_set)}")
    output.clear(output_tags='status_text')
    with open('/content/gdrive/My Drive/squad/' + out_filename, 'w') as fp:
        json.dump(predictions, fp)
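# A hypothetical driver for run_predictions, assuming the notebook defines
# answer_question(question, context) elsewhere and loads SQuAD dev examples
# with the transformers processors (the data path here is illustrative):
from transformers.data.processors.squad import SquadV2Processor

processor = SquadV2Processor()
eval_examples = processor.get_dev_examples('/content/squad',
                                           filename='dev-v2.0.json')
# Predict the first 100 examples, pad the rest with empty answers, and write
# everything to Drive as predictions.json.
run_predictions(eval_examples, 'predictions.json', n_examples=100, padding=True)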
def generate_document_embeddings(model, response_list, sent_list, doc_list):
    """
    :param model: the USE-QA model
    :param response_list: a list of (sent_id, doc_id) tuples
    :param sent_list: a list of strings
    :param doc_list: a list of strings
    :return: an array of response embeddings, one row per entry in response_list
    """
    batch_size = 100
    embeddings_list = []
    print('Computing embeddings for %s sentences' % len(response_list))
    slices = zip(*(iter(response_list), ) * batch_size)
    num_batches = len(response_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))
        response_batch = [sent_list[int(sent_id)] for sent_id, doc_id in s]
        context_batch = [doc_list[int(doc_id)] for sent_id, doc_id in s]
        encodings = model.signatures['response_encoder'](
            input=tf.constant(response_batch),
            context=tf.constant(context_batch))
        for i in range(len(response_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    # Encode the final partial batch, if any.
    if batch_size * num_batches < len(response_list):
        remainder = response_list[num_batches * batch_size:]
        response_batch = [sent_list[int(sent_id)] for sent_id, doc_id in remainder]
        context_batch = [doc_list[int(doc_id)] for sent_id, doc_id in remainder]
        encodings = model.signatures['response_encoder'](
            input=tf.constant(response_batch),
            context=tf.constant(context_batch))
        for i in range(len(response_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    return np.array(embeddings_list)
def generate_document_embeddings_no_context(model, response_list):
    """
    :param model: the USE-QA model
    :param response_list: a list of strings
    :return: an array of response embeddings, one row per entry in response_list
    """
    batch_size = 100
    embeddings_list = []
    print('Computing embeddings for %s sentences' % len(response_list))
    slices = zip(*(iter(response_list), ) * batch_size)
    num_batches = len(response_list) // batch_size
    for n, s in enumerate(slices):
        output.clear(output_tags='progress')
        with output.use_tags('progress'):
            print('Processing batch %s of %s' % (n + 1, num_batches))
        # Per https://tfhub.dev/google/universal-sentence-encoder-qa/3, repeat
        # the answer batch as its own context when no context is available.
        answer_batch = list(s)
        encodings = model.signatures['response_encoder'](
            input=tf.constant(answer_batch),
            context=tf.constant(answer_batch))
        for i in range(len(answer_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    # Encode the final partial batch, if any.
    if batch_size * num_batches < len(response_list):
        answer_batch = list(response_list[num_batches * batch_size:])
        encodings = model.signatures['response_encoder'](
            input=tf.constant(answer_batch),
            context=tf.constant(answer_batch))
        for i in range(len(answer_batch)):
            embeddings_list.append(np.array(encodings['outputs'][i]))
    return np.array(embeddings_list)
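# Putting the two encoders together: USE-QA is trained so that the inner
# product of a question embedding and a response embedding scores relevance.
# A small retrieval sketch reusing the `use_qa` handle loaded above (the
# sentences and documents are made up for illustration):
sentences = ['Paris is the capital of France.', 'The sky is blue.']
docs = ['An article about France.', 'An article about weather.']
response_embeddings = generate_document_embeddings(
    use_qa, [(0, 0), (1, 1)], sentences, docs)
query_embeddings = generate_query_embeddings(
    use_qa, [{"question": "What is the capital of France?"}])
scores = np.inner(query_embeddings, response_embeddings)  # shape (1, 2); higher = more relevant
print(sentences[int(np.argmax(scores[0]))])  # -> the France sentence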
@contextlib.contextmanager  # generator method used via `with`; assumes `import contextlib`
def _output_in_widget(self):
    with output.use_tags(self._output_tags):
        try:
            yield
        finally:
            _util.flush_all()
        # Prompt appearing here means something went wrong with the drive binary.
        d.kill(_signal.SIGKILL)
        extra_reason = ''
        if 0 == _subprocess.call(
            'grep -q "{}" "{}"'.format(timeout_pattern, dfs_log), shell=True):
          extra_reason = (
              ': timeout during initial read of root folder; for more info: '
              'https://research.google.com/colaboratory/faq.html#drive-timeout')
        raise ValueError('mount failed' + extra_reason)
      elif case == 2:
        # Not already authorized, so do the authorization dance.
        auth_prompt = d.match.group(1) + '\nEnter your authorization code:\n'
        with _output.use_tags('dfs-auth-dance'):
          with open(fifo, 'w') as fifo_file:
            fifo_file.write(get_code(auth_prompt) + '\n')
          wrote_to_fifo = True
      elif case == 5:
        raise ValueError('mount failed: invalid oauth code')
    if not wrote_to_fifo:
      with open(fifo, 'w') as fifo_file:
        fifo_file.write('ignored\n')
    filtered_logfile = _timeouts_path()
    d.sendline('fuser -kw "{f}" ; rm -rf "{f}"'.format(f=filtered_logfile))
    d.expect(prompt)
    filter_script = _os.path.join(drive_dir, 'drive-filter.py')
    filter_cmd = (
        """nohup bash -c 'tail -n +0 -F "{}" | """
        """python3 {} > "{}" ' < /dev/null > /dev/null 2>&1 &""").format(
def display(self):
    with output.use_tags([self._tag]):
        display(self)  # resolves to the global IPython display function, not this method
def main():
    window_size = 6
    batch_size = 16
    model_name = 'skipgram-win12'
    ckpt_dir = '/content/gdrive/My Drive/files/{0}/{0}'.format(model_name)
    ckpt_dir = ckpt_dir + '-ep{}.ckpt'

    text, dictionary = util.nltk2data(brown, save_dict=False, remove_punc=True)
    train_inps, train_labels, eval_inps, eval_labels = util.data_dicer(
        text,
        dictionary,
        window_size,
        batch_size,
        chopoff=True,
        train_eval_split=0.8)

    # TODO: placeholders for input and output (label)
    vocab_size = len(dictionary.keys())
    embed_dim = 512
    num_sampled = 10
    num_true = 2 * window_size
    epochs = 1
    current_epoch = 1

    x_ph = tf.placeholder(tf.int32, shape=(None, ), name='x_ph')
    target_ph = tf.placeholder(tf.int32, shape=(None, num_true), name='target_ph')

    # TODO: construct embedding layers USING VARIABLE SCOPE!
    with tf.variable_scope('skipgram', reuse=tf.AUTO_REUSE):
        embedding_layer = tf.get_variable(
            'embedding_layer',
            shape=(vocab_size, embed_dim),
            dtype=tf.float32,
            initializer=tf.constant_initializer(np.random.randn(vocab_size, embed_dim)))
        output_weights = tf.get_variable(
            'output_weight',
            shape=(vocab_size, embed_dim),
            dtype=tf.float32,
            initializer=tf.constant_initializer(np.random.randn(vocab_size, embed_dim)))
        output_biases = tf.get_variable(
            'output_bias',
            shape=(vocab_size, ),
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

    # TODO: map the word using tf.nn.embedding_lookup
    # center_word shape = [batch_size, embed_dim]
    center_word = tf.nn.embedding_lookup(embedding_layer, x_ph)

    # TODO: check which mode, either training or eval
    # TODO: training, calculate NCE loss
    train_loss = tf.nn.nce_loss(
        weights=output_weights,
        biases=output_biases,
        labels=target_ph,
        inputs=center_word,
        num_true=num_true,
        num_sampled=num_sampled,
        num_classes=vocab_size)
    train_batch_loss = tf.reduce_mean(train_loss)
    opt = tf.train.AdamOptimizer().minimize(train_batch_loss)

    # TODO: eval, calculate manually? Refer to the TensorFlow guide.
    # output_weights.shape = [vocab_size, dim]
    # center_word.shape = [batch_size, dim]
    # projections = tf.matmul(center_word, tf.transpose(output_weights)) + output_biases
    # eval_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    #     logits=projections, labels=tf.squeeze(target_ph))
    # eval_batch_loss = tf.reduce_mean(eval_loss)

    saver = tf.train.Saver(tf.trainable_variables())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if current_epoch > 1:
            saver.restore(sess, ckpt_dir.format(current_epoch - 1))

        start_time = time.time()
        print("=" * 80)
        print("=" * 80)
        print("Start time: {}".format(
            time.strftime('%Y/%m/%d, %H:%M:%S', time.localtime(start_time))))
        print("=" * 80)
        print("=" * 80)

        for i in range(epochs):
            epoch_start_time = time.time()
            print("Epoch: ", i + 1)
            epoch_loss = []
            # Per-batch stats.
            batch_number = 0
            seconds_per_batch = 0
            total_batch = len(train_inps)
            for batch_inps, batch_labels in zip(train_inps, train_labels):
                batch_start_time = time.time()
                batch_number = batch_number + 1
                left_over_time = round(
                    (total_batch + 1 - batch_number) * seconds_per_batch, 3)
                output.clear(output_tags='batch_print', wait=True)
                with output.use_tags('batch_print'):
                    print("Current batch:{}/{}\tSeconds per batch: {}s, {}s left.".format(
                        batch_number, total_batch + 1, seconds_per_batch,
                        left_over_time))
                feed_dict = {x_ph: batch_inps, target_ph: batch_labels}
                _, batch_loss_v = sess.run([opt, train_batch_loss],
                                           feed_dict=feed_dict)
                epoch_loss.append(batch_loss_v)
                batch_end_time = time.time()
                seconds_per_batch = round(batch_end_time - batch_start_time, 3)

            # TODO: averaging epoch_loss
            feed_dict = {x_ph: eval_inps, target_ph: eval_labels}
            [eval_loss_v] = sess.run([train_batch_loss], feed_dict=feed_dict)
            epoch_loss = np.mean(epoch_loss)

            # TODO: print train loss, print eval loss
            epoch_end_time = time.time()
            epoch_duration = epoch_end_time - epoch_start_time
            print("Epoch took: {}s".format(round(epoch_duration, 3)))
            print("Train Loss: {}\tEval Loss: {}".format(
                round(epoch_loss, 3), round(eval_loss_v, 3)))

        end_time = time.time()
        duration = end_time - start_time
        print("=" * 80)
        print("=" * 80)
        print("End time: {}".format(
            time.strftime('%Y/%m/%d, %H:%M:%S', time.localtime(end_time))))
        print("Duration: {}s".format(round(duration, 2)))
        print("=" * 80)
        print("=" * 80)

        saver.save(sess, ckpt_dir.format(current_epoch))
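# The training script above assumes a TF1-style graph environment plus a
# project-local `util` module (nltk2data and data_dicer are that module's
# helpers, not public APIs). A plausible preamble on a modern TF build, using
# the compat.v1 shim (an assumption, not part of the original):
import time
import numpy as np
import tensorflow.compat.v1 as tf
from nltk.corpus import brown
from google.colab import output
import util

tf.disable_v2_behavior()  # placeholders and sessions require graph mode

if __name__ == '__main__':
    main()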
import logging
import os

import tensorflow as tf

from .preprocessing import load_conversations, tokenize_and_filter
from .inference import predict_greedy, predict_beam
from .model import transformer, CustomSchedule, loss_function, accuracy
from .params import *
from ..components import save_obj, load_obj, make_tokenizer, train

logging.basicConfig(level=logging.INFO)
tf.random.set_seed(42)
tf.keras.backend.clear_session()

IS_TPU = False
if os.environ.get('IS_COLAB', False):
    from google.colab import output
    try:
        with output.use_tags('setup'):
            # !pip install convokit
            # !python3 -m spacy download en
            tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
            print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
            tf.config.experimental_connect_to_cluster(tpu)
            tf.tpu.experimental.initialize_tpu_system(tpu)
            tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
            IS_TPU = True
        output.clear(output_tags='setup')
    except ValueError:
        logging.info('Not connected to a TPU runtime')
        if val > 1 - SKIP_PERCANTAGE:
            continue

        # Get the tokens and remove words that are too short.
        tokens = []
        try:
            tokens = row['cleaned_Data_Content'].split(' ')
            tokens = [token for token in tokens if len(token) > 3]
        except Exception:
            print('A book failed')
        books.append(tokens)
        genres.append(genre)

        output.clear(output_tags='status_text')
        with output.use_tags('status_text'):
            print('Books loaded: ' + str(count))
        count += 1

genres = np.array(genres)
books = np.array(books)

# Additional data processing to optimize the topics generated.
commonWords = []
with open("/content/drive/My Drive/ATIML/common.txt") as common:
    for word in common:
        commonWords.append(word[:-1])

forbidden = [
    <h1 id='colorized' class='font-effect-3d'>""" + string + """</h1>
    <script>
      var tc = setInterval(function() {
        var now = new Date().getTime();
        var iddoc = document.getElementById('colorized');
        iddoc.style.color = d3.interpolate""" + cmap + """((now % 60000) / 60000);
      }, 1);
    </script>"""
    display(HTML(html_str))

# %cmap_header CODE MODULES & HELPFUL TOOLS|24|Ewert|Turbo

"""#✒️ Tagged Outputs"""

import sys, time

print('The process is starting')
with output.use_tags('tagged_outputs'):
    for el in ['working \n', '=> => => \n', 'still working \n']:
        sys.stdout.write(el)  # write each element (was writing a fixed string)
        sys.stdout.flush()
        time.sleep(5)
output.clear(output_tags='tagged_outputs')
print('Outputs have cleared')

"""#✒️ Linked Outputs"""

# Commented out IPython magic to ensure Python compatibility.
# %%javascript
# const listener = new BroadcastChannel('channel1');
# listener.onmessage = (msg) => {
#   const div = document.createElement('div');
#   div.textContent = msg.data;
#   div.style.border = 'double white';
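# For completeness, the Python side of a linked-output demo can post to the
# BroadcastChannel with output.eval_js -- a sketch assuming the 'channel1'
# name used by the listener above:
from google.colab import output
output.eval_js("new BroadcastChannel('channel1').postMessage('hello from Python')")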