def _preprocess_episodes(self, episodes, dictionary, mode):
    """
    Tokenize all the fields in Wizard-of-Wikipedia

    Args:
        episodes: iterable of episodes; each episode is a sequence of example
            dicts with the keys 'text', 'labels' (train) or 'eval_labels'
            (otherwise), 'chosen_topic', 'title', 'checked_sentence' and
            'knowledge' (newline-separated candidate sentences).
        dictionary: tokenizer exposing `tokenize` and `convert_tokens_to_ids`.
        mode: "train" selects example['labels']; any other value selects
            example['eval_labels'].

    Returns:
        Tuple (new_episodes, dictionary). `new_episodes` is a list of lists of
        dicts whose text fields are space-joined token-id strings. The first
        entry of 'knowledge_sentences' is always the checked knowledge.

    Side effects:
        When `self._datapath` is truthy, the result is cached as JSON lines
        (one episode per line) in `self._get_preprocessed_fname(mode)`.
    """
    colorlog.info("Preprocess wizard of wikipedia dataset")

    def tokenize(text):
        # "<CLS id> <token ids...> <SEP id>" as a single space-joined string
        token_ids = dictionary.convert_tokens_to_ids(dictionary.tokenize(text))
        return ' '.join(
            [str(data_vocab.BERT_CLS_ID)]
            + [str(token_id) for token_id in token_ids]
            + [str(data_vocab.BERT_SEP_ID)]
        )

    label_key = 'labels' if mode == "train" else 'eval_labels'

    new_episodes = []
    for episode_num, episode in enumerate(tqdm(episodes, ncols=70)):
        new_examples = []
        for example_num, example in enumerate(episode):
            # Tokenize inputs and convert to tokens
            context = tokenize(example['text'])
            response = tokenize(example[label_key][0])
            chosen_topic = tokenize(example['chosen_topic'])

            # Set up knowledge: the checked (gold) knowledge goes first and
            # must not appear a second time among the candidates.
            checked_knowledge = (example['title'] + ' __knowledge__ '
                                 + example['checked_sentence'])
            candidates = example['knowledge'].rstrip().split('\n')
            try:
                # Drops only the first match. The previous `if idx:` test
                # skipped the deletion when the match was the very first
                # candidate (idx == 0), leaving a duplicate behind.
                candidates.remove(checked_knowledge)
            except ValueError:
                # Sometimes, knowledge does not include checked_sentence
                colorlog.warning(
                    "Knowledge does not include checked sentence.")
            knowledges = [checked_knowledge] + candidates

            # Tokenize knowledge
            knowledge_sentences = [tokenize(k) for k in knowledges]

            new_examples.append({
                'context': context,
                'response': response,
                'chosen_topic': chosen_topic,
                'knowledge_sentences': knowledge_sentences,
                'episode_num': episode_num,
                'example_num': example_num
            })
        new_episodes.append(new_examples)

    if self._datapath:
        episodes_fname = self._get_preprocessed_fname(mode)
        colorlog.info(f"Cache preprocessed dataset to {episodes_fname}")
        with open(episodes_fname, 'w') as fp:
            for episode in new_episodes:
                fp.write(json.dumps(episode) + '\n')

    return new_episodes, dictionary
def average_gradients(tower_grads, skip_none=False):
    """
    Average the gradient of every shared variable over all towers
    (adapted from the TensorFlow CIFAR-10 tutorial).

    This acts as a synchronization point across the towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples, one inner
            list per tower, all listing the variables in the same order.
        skip_none: Boolean. When True, a variable whose gradient on the first
            tower is None is logged and left out of the result; when False the
            None gradient is passed on to TensorFlow as-is.
    Returns:
        List of (gradient, variable) pairs where the gradient is the mean over
        all towers and the variable is the first tower's reference.
    """
    averaged = []
    # zip(*...) regroups the data per variable:
    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
    for per_tower in zip(*tower_grads):
        if per_tower[0][0] is None and skip_none:
            colorlog.warning("%s has None gradient" % per_tower[0][1].name)
            continue

        # Give every gradient a leading 'tower' axis, glue them together
        # along it and take the mean over that axis.
        with_tower_axis = [tf.expand_dims(g, 0) for g, _ in per_tower]
        mean_grad = tf.reduce_mean(tf.concat(with_tower_axis, 0), 0)

        # The variables are shared between towers, so the first tower's
        # pointer is as good as any other.
        averaged.append((mean_grad, per_tower[0][1]))
    return averaged
def _create_report(self, onweb=True):
    """Append the HTML sections of the regression report to ``self.jinja``.

    A summary section built from ``self.params`` (keys ``drugid``, ``method``,
    ``alpha``, ``bayes``, ``Rp``) is appended first. Then, for each of the
    images *boxplot*, *randomness*, *importance* and *weights*, a section with
    a title, a description and an ``<img>`` tag is appended if the file
    ``<prefix_images><name>_<drugid>.png`` exists; otherwise a warning is
    logged and the image is skipped.

    ``self.params`` is updated in place with the keys ``name``, ``text``,
    ``filename`` and ``title`` of the last image processed.

    :param onweb: not used by this method.
    """
    # <br> is a void element; the former "</br>" end tag is not valid HTML.
    summary = """<div>
    <b>DrugID:</b> %(drugid)s<br>
    <b>Regression method:</b> %(method)s <br>
    <b>Regression, alpha parameter used:</b> %(alpha)s<br>
    <b>Bayes factor:</b> %(bayes)s<br>
    <b>Coefficient of regression (pearson):</b> %(Rp)s<br>
    </div>
    """ % self.params
    self.jinja['sections'].append(summary)

    no_feature_text = ("Feature with non-null weights. If empty, it"
                       " means no feature of interests were found")
    text = {
        'boxplot': ("This boxplot shows the %s most important features "
                    "(based on the weights of the regression)."
                    ) % self.caller.config["boxplot_n"],
        'importance': no_feature_text,
        'randomness': ("Here we run the regression analysis %s times and "
                       "plot the regression value (x-axis) for the real data "
                       "(blue) and randomising the variable to explain (red). "
                       ) % self.caller.config['randomness'],
        'weights': no_feature_text,
    }

    for this in ["boxplot", "randomness", "importance", "weights"]:
        self.params['name'] = this
        self.params['text'] = text[this]
        filename = (self.caller.prefix_images
                    + "%(name)s_%(drugid)s.png" % self.params)
        self.params["filename"] = filename
        self.params['title'] = this.title()

        if os.path.exists(filename):
            # The closing </div> was missing, which nested every following
            # section inside this one.
            section = """<div>
            <h2>%(title)s results</h2>
            <p>%(text)s</p>
            <img src="%(filename)s">
            </div>
            """ % self.params
            self.jinja['sections'].append(section)
        else:
            logger.warning("%s not found. Skipped" % filename)
def count_reps(pose_estimator: PoseEstimator, tts: TTS, exercises: dict, exercise: str, count_to: int):
    """Count repetitions of `exercise` aloud until `count_to` is reached.

    While a 3-step countdown runs in a background thread, the initial joint
    angles are captured from the pose estimator. Afterwards every available
    set of keypoints is compared with those angles (via the estimator's
    `mse`); the comparison against the exercise's `mse_threshold` yields a
    0/1 signal, and each 0 -> 1 transition of the filtered signal counts as
    one repetition, which is announced through `tts.say`.

    Returns None, both on completion and when the initial angles could not
    be captured. Raises NotImplementedError for time-based exercises.
    """
    joints = exercises[exercise]['joints']
    mse_threshold = exercises[exercise]['mse_threshold']
    time_based = exercises[exercise]['time_based']

    angle_estimator = JointAngleEstimator(joints)
    reps_done = 0

    # Keep feeding keypoints to the estimator for as long as the countdown
    # is being spoken.
    starter = Thread(target=countdown, args=(3, tts))
    starter.start()
    while starter.is_alive():
        angle_estimator.init(pose_estimator.get_keypoints())

    if not angle_estimator.is_init:
        logging.warning("Couldn't capture initial joint angles. Cancelling exercise.")
        return
    logging.info(f'Initial joint angles captured as {angle_estimator.get_init_angles()}.')

    if time_based:
        raise NotImplementedError

    last_mse = 0
    raw_window = deque([1] * 3, maxlen=3)
    filtered_window = deque([1] * 2, maxlen=2)
    while reps_done < count_to:
        if not pose_estimator.keypoints_available:
            continue

        mse = angle_estimator.mse(pose_estimator.get_keypoints())
        if mse is None:
            # No measurement this frame: reuse the previous one.
            mse = last_mse
        last_mse = mse

        below = int(mse < mse_threshold)
        raw_window.append(below)
        if below == 0:
            # Filtering step: only accept a 0 once the whole raw window is 0.
            below = 0 if sum(raw_window) == 0 else 1
        filtered_window.append(below)

        rising_edge = filtered_window[-2] == 0 and filtered_window[-1] == 1
        reps_done += rising_edge
        if rising_edge:
            tts.say(f'{reps_done}.')
def boxplot_pancan(self, mode, fignum=1, title_prefix=''):
    """Create boxplot related to the MSI, Tissue or Media factor

    :param mode: one of **tissue**, **msi** or **media**
    :param fignum: number of the pylab figure to use; it is cleared first
    :param title_prefix: not used by this method

    Nothing is drawn (and a warning is logged) when
    ``self._get_boxplot_data(mode)`` returns None. When ``self.savefig`` is
    True the figure is saved as
    ``<directory>/ODOF_<mode>_DRUG_<drug>____<feature>.png``.
    """
    assert mode in ['tissue', 'msi', "media"]
    results = self._get_boxplot_data(mode)
    if results is None:
        logger.warning(
            "No tissue with at least 2 pos and 2 neg found (no image created)."
        )
        return

    fig = pylab.figure(fignum)
    oldsize = fig.get_size_inches()
    pylab.clf()  # or close ?

    data, names, significance = results

    # Shrink the font as the number of boxes grows, never below 4.
    N = len(names)
    if N <= 2:  # msi or 2 tissues
        fontsize = self.fontsize
    elif N <= 14:
        fontsize = max(
            4, int(self.fontsize - (N - 2.) / (self.fontsize - 4.)))
    else:
        fontsize = max(4, int(self.fontsize / 1.4))

    bb = boxswarm.BoxSwarm(data, names, fontsize=fontsize)
    bb.xlabel = r'%s log(IC50)' % self.drug
    if mode == 'tissue':
        bb.title = 'FEATURE/Cancer-type interactions'
    elif mode == 'msi':
        bb.title = 'FEATURE/MS-instability interactions'
    elif mode == "media":
        bb.title = 'FEATURE/Media interactions'
    ax = bb.plot(vert=False)

    # Mirror the left axis on a twin axis and label each box with the
    # number of points it contains.
    common_ylim = ax.get_ylim()
    common_ticks = ax.get_yticks()
    self.ax = ax.twinx()
    self.ax.set_ylim(common_ylim)
    self.ax.set_yticks(common_ticks)
    self.ax.set_yticklabels([str(len(this)) + " " for this in data],
                            fontsize=fontsize / 1.4)

    try:
        pylab.tight_layout()
    except Exception:
        # Layout tuning is best-effort; a failure must not prevent the
        # plot. (A bare except would also swallow KeyboardInterrupt and
        # SystemExit.)
        pass

    if self.savefig is True:
        filename = self.directory + os.sep
        filename += 'ODOF_{}_DRUG_{}____{}'.format(mode, self.drug,
                                                   self.feature)
        # Save at a fixed large size, then restore the on-screen size.
        fig.set_size_inches(14, 16)
        pylab.savefig(filename + '.png', bbox_inches='tight')
        fig.set_size_inches(oldsize)
    fig.canvas.draw()
.. code-block:: python failed_testcases_list = swilog.get_error_list() if failed_testcases_list != []: assert 0, "Some tests failed:/\n%s"% "/\n".join(failed_testcases_list) """ import sys try: import logging import colorlog except: import logging as colorlog colorlog.warning("install colorlog to have colors: pip install colorlog") __copyright__ = "Copyright (C) Sierra Wireless Inc." DELIMITER = "====================" LOG_FORMAT = "\n%(log_color)s%(asctime)s %(levelname)s %(message)s" DATE_FORMAT = "%H:%M:%S" default_log_colors = { "NOTSET": "white", "DEBUG": "blue", "INFO": "green", "STEP": "bold_blank", "WARNING": "bold_yellow", "ERROR": "bold_red", "CRITICAL": "bold_red",
def copy_decode(self, mixed_inputs, encoder_outputs, decoder_outputs,
                attention_bias, training):
    """
    Generate softmax values of logits in the target sequence.

    The result mixes a vocabulary distribution (softmax over the output
    embedding) with a copy distribution (attention weights of the first head
    scattered onto the vocabulary ids found in `mixed_inputs`). Note that the
    output is therefore a probability distribution, not logits.

    Args: Same as decode function's arguments
        - mixed_inputs: input values of context and chosen knowledge.
            Int tensor with shape [batch_size, mixed_input_length]
        - encoder_outputs: continuous representation of input sequence.
            float tensor with shape
            [batch_size, sentence_max_length, word_embed_size]
        - decoder_outputs: continuous representation of output sequence.
            float tensor with shape
            [batch_size, target_length - 1, word_embed_size]
        - attention_bias: float tensor with shape
            [batch_size, 1, 1, sentence_max_length]
        training: boolean, whether in training mode or not.
    Returns:
        float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("copy_decode"):
        # Adjacent literals instead of a backslash continuation inside the
        # string, which leaked source indentation into the log message.
        colorlog.warning("Use pointer-generator mechanism. "
                         "Note that output is not logit but softmax.")

        batch_size = tf.shape(mixed_inputs)[0]
        if not training:
            # At inference the batch is expanded by the beam size.
            batch_size = batch_size * self.hparams.beam_size

        # Helpers of the last decoder layer are reused for head handling,
        # the output projection and the feed-forward network.
        last_layer = self.decoder_stack.layers[-1]
        attention = last_layer[1].layer

        q = self._copy_q_layer(decoder_outputs)
        k = self._copy_k_layer(encoder_outputs)
        v = self._copy_v_layer(encoder_outputs)

        # Codes for multi heads attention, but not necessary.
        q = attention.split_heads(q)
        k = attention.split_heads(k)
        v = attention.split_heads(v)

        # Scaled dot-product attention.
        depth = (self.hparams.word_embed_size // self.hparams.num_heads)
        q *= depth ** -0.5

        a_t = tf.matmul(q, k, transpose_b=True)
        a_t += attention_bias
        # [batch_size, num_heads, target_length - 1, mixed_input_length]
        p_att = _float32_softmax(a_t, name="p_copy")
        if training:
            p_att = tf.nn.dropout(
                p_att,
                noise_shape=[tf.shape(p_att)[0], tf.shape(p_att)[1], 1, 1],
                rate=self.hparams.attention_dropout)

        # [batch_size, num_heads, target_length - 1, depth]
        hidden = tf.matmul(p_att, v)

        # Only the first head's attention weights are used as the copy
        # distribution.
        p_att = p_att[:, 0]
        hidden = attention.combine_heads(hidden)
        hidden = attention.output_dense_layer(hidden)

        # feed forward network
        hidden = last_layer[2](hidden, training=training)
        hidden = self.decoder_stack.output_normalization(hidden)

        # [batch_size, target_length - 1, vocab_size]
        p_vocab = _float32_softmax(
            self._output_embedding(decoder_outputs, mode="linear"))

        # matching (p_att.shape) to (p_vocab.shape)
        initial_indices = tf.tile(mixed_inputs[:, tf.newaxis, :],
                                  [1, tf.shape(p_vocab)[1], 1])
        i1, i2 = tf.meshgrid(tf.range(batch_size),
                             tf.range(tf.shape(p_vocab)[1]),
                             indexing="ij")
        i1 = tf.tile(i1[:, :, tf.newaxis], [1, 1, tf.shape(p_att)[2]])
        i2 = tf.tile(i2[:, :, tf.newaxis], [1, 1, tf.shape(p_att)[2]])
        # [batch_size, target_length - 1, mixed_input_length, 3]
        indices = tf.stack([i1, i2, initial_indices], axis=-1)
        # [batch_size, target_length - 1, vocab_size]
        p_att = tf.scatter_nd(indices, p_att, shape=tf.shape(p_vocab))

        p_gen = self._copy_layer(hidden)
        # [batch_size, target_length - 1, vocab_size]
        p_gen = tf.tile(p_gen, [1, 1, self.hparams.vocab_size])
        # [batch_size, target_length - 1, vocab_size]
        # p_gen weights the copy distribution, (1 - p_gen) the vocabulary one.
        p_word = (1 - p_gen) * p_vocab + p_gen * p_att

        return p_word
def __init__(self, filename=None, empty_tissue_name="UNDEFINED"):
    """.. rubric:: Constructor

    If no file is provided, using the default file provided in the
    package that is made of 1001 cell lines times 680 features.

    :param str filename: input file; defaults to the packaged
        ``genomic_features`` dataset.
    :param str empty_tissue_name: if a tissue name is let empty, replace
        it with this string.

    Columns starting with ``Drug_`` and sample-name columns are dropped,
    deprecated column names are renamed, and a tissue column filled with
    'UNDEFINED' is created when missing.
    """
    # first reset the filename to the shared data (if not provided)
    if filename is None:
        from gdsctools.datasets import genomic_features
        filename = genomic_features
    # used in the header so should be set before call to super()
    super(GenomicFeatures, self).__init__(filename)

    # FIXME Remove columns related to Drug if any. Can be removed in
    # the future
    self.df = self.df[[
        x for x in self.df.columns if not x.startswith('Drug_')
    ]]

    for this in ['Sample Name', 'SAMPLE_NAME', 'Sample_Name', 'CELL_LINE']:
        if this in self.df.columns:
            self.df.drop(this, axis=1, inplace=True)

    # Let us rename "COSMIC ID" into "COSMIC_ID" if needed
    deprecated_names = {
        'Tissue Factor Value': 'TISSUE_FACTOR',
        'MS-instability Factor Value': 'MSI_FACTOR',
        'COSMIC ID': 'COSMIC_ID'
    }
    for old, new in deprecated_names.items():
        if old in self.df.columns:
            # Extra positional arguments of a logging call are %-format
            # arguments, so a warning category must not be passed here.
            colorlog.warning(
                "'%s' column name is deprecated since 0.9.10. "
                "Please replace with '%s'", old, new)
            # Rename the exact column only; a substring replace would
            # also alter unrelated names containing the old name.
            self.df.columns = [
                new if x == old else x for x in self.df.columns
            ]

    # "CL" is an older name of the COSMIC identifier column. Exact match
    # only: a substring replace would corrupt names such as "BCL2_mut".
    if "CL" in self.df.columns and "COSMIC_ID" not in self.df.columns:
        self.df.columns = [
            "COSMIC_ID" if x == "CL" else x for x in self.df.columns
        ]

    # There are 3 special columns to hold the factors
    self._special_names = []

    # If tissue factor is not provided, we create and fill it with dummies.
    # Otherwise, we need to change a lot in the original code in ANOVA
    if self.colnames.tissue not in self.df.columns:
        colorlog.warning("column named '%s' not found",
                         self.colnames.tissue)
        self.df[self.colnames.tissue] = ['UNDEFINED'] * len(self.df)
    self._special_names.append(self.colnames.tissue)

    self.found_msi = self.colnames.msi in self.df.columns
    if self.found_msi is False:
        colorlog.warning("column named '%s' not found", self.colnames.msi)
    else:
        self._special_names.append(self.colnames.msi)

    # A missing media column is silently accepted.
    self.found_media = self.colnames.media in self.df.columns
    if self.found_media:
        self._special_names.append(self.colnames.media)

    # order columns and index
    self._order()

    self._fix_empty_tissues(empty_tissue_name)
def _interpret(self):
    """Normalise ``self.df`` so that it is indexed by COSMIC identifier.

    Deprecated identifier column names ("COSMIC ID", "CL") are renamed to
    "COSMIC_ID". If the column named ``self.cosmic_name`` is present it
    becomes the index and, when at least one column starts with ``Drug_``,
    only the columns starting with "Drug" are kept. If the index is already
    named ``self.cosmic_name`` the columns are only checked for uniqueness.
    Unless ``self._v18`` is True, column names are then converted with
    ``drug_name_to_int`` and both columns and index are cast to int.

    :raises ValueError: if ``self.cosmic_name`` is neither a column nor the
        name of the index.
    """
    # if there is at least one column that starts with Drug_ then all
    # other columns are dropped except "COSMIC ID".
    # For back compatibility with data that mixes Drug identifiers and
    # genomic features:
    _cols = [str(x) for x in self.df.columns]
    drug_prefix = "Drug" if any(x.startswith("Drug_") for x in _cols) else None

    if "COSMIC ID" in _cols and self.cosmic_name not in _cols:
        # Extra positional arguments of a logging call are %-format
        # arguments, so no warning category is passed.
        colorlog.warning(
            "'COSMIC ID' column name is deprecated since "
            "0.9.10. Please replace with 'COSMIC_ID'")
        # Exact-match rename; a substring replace could alter other names.
        self.df.columns = [
            "COSMIC_ID" if x == "COSMIC ID" else x for x in self.df.columns
        ]

    if "CL" in _cols and "COSMIC_ID" not in self.df.columns:
        colorlog.warning(
            "'CL' column name is deprecated since "
            "0.9.10. Please replace with 'COSMIC_ID'")
        self.df.columns = [
            "COSMIC_ID" if x == "CL" else x for x in self.df.columns
        ]

    # If the data has not been interpreted, COSMIC column should be
    # found in the column and set as the index
    if self.cosmic_name in self.df.columns:
        self.df.set_index(self.cosmic_name, inplace=True)
        if drug_prefix:
            _cols = [str(x) for x in self.df.columns]
            columns = [x for x in _cols if x.startswith(drug_prefix)]
            self.df = self.df[columns]
    # If already interpreted, COSMIC name should be the index already.
    # and should be integers, so let us cast to integer
    elif self.df.index.name == self.cosmic_name:
        if drug_prefix:
            # NOTE(review): the previous code built the Drug-prefixed
            # subset here and immediately overwrote it with all columns,
            # so nothing is dropped in this branch. That effective
            # behaviour is kept; confirm whether filtering was intended.
            columns = self.df.columns
            assert len(columns) == len(set(columns))
            self.df = self.df[columns]
    # Otherwise, raise an error
    else:
        raise ValueError(
            "{0} column could not be found in the header".format(
                self.cosmic_name))

    # In v18, the drug ids may be duplicated
    if self._v18 is True:
        return

    self.df.columns = [drug_name_to_int(x) for x in self.df.columns]
    self.df.columns = self.df.columns.astype(int)

    self.df.index = [int(x) for x in self.df.index]
    self.df.index = self.df.index.astype(int)
    self.df.index.name = "COSMIC_ID"

    # Check uniqueness
    self._check_uniqueness(self.df.index)