def process_atis(infold, outfold, modes=('train', 'test'), do_lower_case=False):
    """Convert the ATIS dataset into per-mode sentence/label and slot files.

    MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk

    For every mode, ``{outfold}/{mode}.tsv`` (header ``sentence\\tlabel``) and
    ``{outfold}/{mode}_slots.tsv`` are written, and the intent/slot
    dictionaries are copied next to them.

    Args:
        infold: Folder holding the ``atis.*.csv`` input files.
        outfold: Folder the processed files are written to.
        modes: Dataset splits to process.
        do_lower_case: Whether to lower-case every sentence.

    Returns:
        ``outfold`` when the processed files already exist, otherwise ``None``.
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    for mode in modes:
        # Context managers guarantee that both output files (including the
        # slots file) and all input files are closed, even on error.
        with open(os.path.join(outfold, mode + '.tsv'), 'w') as sentence_file, \
                open(f'{outfold}/{mode}_slots.tsv', 'w') as slot_file:
            sentence_file.write('sentence\tlabel\n')

            with open(f'{infold}/atis.{mode}.query.csv', 'r') as f:
                queries = f.readlines()
            with open(f'{infold}/atis.{mode}.intent.csv', 'r') as f:
                intents = f.readlines()
            with open(f'{infold}/atis.{mode}.slots.csv', 'r') as f:
                slots = f.readlines()

            for i, query in enumerate(queries):
                # The first and last token of each row are dropped before
                # the ids are mapped back to text.
                sentence = ids2text(query.strip().split()[1:-1], vocab)
                if do_lower_case:
                    sentence = sentence.lower()
                sentence_file.write(f'{sentence}\t{intents[i].strip()}\n')
                slot = ' '.join(slots[i].strip().split()[1:-1])
                slot_file.write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
def save_embeddings(self, bert_hidden_states, output_file, mode):
    """Generate schema element embeddings and save them as a numpy file.

    One dictionary of zero-initialised arrays is created per entry of
    ``self.schemas.services``; the arrays are filled in by
    ``self._populate_schema_embeddings`` and the resulting list is written
    with ``np.save``.

    Args:
        bert_hidden_states: Forwarded to ``_populate_schema_embeddings``.
        output_file: Path of the file the embeddings are saved to.
        mode: Forwarded to ``_populate_schema_embeddings``.
    """
    max_num_intent = self.schema_config["MAX_NUM_INTENT"]
    max_num_cat_slot = self.schema_config["MAX_NUM_CAT_SLOT"]
    max_num_noncat_slot = self.schema_config["MAX_NUM_NONCAT_SLOT"]
    max_num_slot = max_num_cat_slot + max_num_noncat_slot
    max_num_value = self.schema_config["MAX_NUM_VALUE_PER_CAT_SLOT"]
    embedding_dim = self.schema_config["EMBEDDING_DIMENSION"]

    schema_embeddings = [
        {
            "intent_emb": np.zeros([max_num_intent, embedding_dim]),
            "req_slot_emb": np.zeros([max_num_slot, embedding_dim]),
            "cat_slot_emb": np.zeros([max_num_cat_slot, embedding_dim]),
            "noncat_slot_emb": np.zeros([max_num_noncat_slot, embedding_dim]),
            "cat_slot_value_emb": np.zeros([max_num_cat_slot, max_num_value, embedding_dim]),
        }
        for _ in self.schemas.services
    ]

    # Populate the embeddings based on bert inference results and save them.
    self._populate_schema_embeddings(schema_embeddings, bert_hidden_states, mode)

    # Only write when torch.distributed is not initialised or this is rank 0.
    master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
    if master_device:
        # The with-statement closes the file; no explicit close() is needed.
        with open(output_file, "wb") as f_s:
            np.save(f_s, schema_embeddings)
            logging.info(f"The schema embeddings saved at {output_file}")
def on_epoch_end(self):
    """Log how long the epoch that just finished took.

    The message is only emitted when ``self.global_rank`` is ``None`` or ``0``.
    """
    if self.global_rank is None or self.global_rank == 0:
        # Wall-clock time elapsed since the timestamp in _last_epoch_start.
        delta = datetime.timedelta(seconds=(time.time() - self._last_epoch_start))
        logging.info(f"Finished epoch {self.epoch_num} in {delta}")
def __init__(self, data_dir, domains=None):
    """Set up the MultiWOZ data description.

    Loads ``{data_dir}/ontology.json`` and then calls ``self.get_slots()``
    and ``self.get_vocab()``.

    Args:
        data_dir: Directory containing ``ontology.json``.
        domains: Mapping of domain name to index. When ``None`` the five
            default domains (attraction, restaurant, taxi, train, hotel)
            are used.
    """
    logging.info('Processing MultiWOZ dataset')

    if domains is None:
        # Built per call so instances never share one mutable default dict.
        domains = {"attraction": 0, "restaurant": 1, "taxi": 2, "train": 3, "hotel": 4}

    self.all_domains = {
        'attraction': 0,
        'restaurant': 1,
        'taxi': 2,
        'train': 3,
        'hotel': 4,
        'hospital': 5,
        'bus': 6,
        'police': 7,
    }
    self.gating_dict = {'ptr': 0, 'dontcare': 1, 'none': 2}

    self.data_dir = data_dir
    self.domains = domains
    self.vocab = Vocab()

    # Close the ontology file as soon as it has been parsed.
    with open(f'{self.data_dir}/ontology.json', 'r') as ontology_file:
        self.ontology = json.load(ontology_file)

    self.vocab_file = None
    self.slots = None

    self.get_slots()
    self.get_vocab()
def process_assistant(infold, outfold, modes=['train', 'test']):
    """Convert the NLU-Evaluation-Data assistant dataset into intent/slot files.

    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 various multi-domain intents and 57
    entity types.

    Args:
        infold: Folder with the downloaded dataset.
        outfold: Folder the processed files are written to.
        modes: Dataset splits to process.

    Returns:
        ``outfold`` when the ``{mode}_slots.tsv`` files already exist,
        otherwise ``None``.
    """
    expected_outputs = [f'{mode}_slots.tsv' for mode in modes]
    if if_exist(outfold, expected_outputs):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # Copy train/test files into a convenient working directory.
    copy_input_files(infold)
    infold += "/dataset"

    # Intent names come from the train folder (test is expected to match).
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # Queries paired with their intent, one file per mode.
    for mode in modes:
        queries_with_intent = get_intent_queries(infold, intent_names, mode)
        write_files(queries_with_intent, f'{outfold}/{mode}.tsv')

    # All unique slot types found across the requested modes.
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # Slot-annotated queries, one file per mode.
    slot_dict = {slot_name: slot_id for slot_id, slot_name in enumerate(slot_types)}
    for mode in modes:
        queries_with_slots = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(queries_with_slots, f'{outfold}/{mode}_slots.tsv')
def on_action_end(self):
    """Close the summary writer, if any, and log the total elapsed time.

    Nothing happens unless ``self.global_rank`` is ``None`` or ``0``.
    """
    if self.global_rank is not None and self.global_rank != 0:
        return

    if self._swriter is not None:
        self._swriter.close()

    elapsed = datetime.timedelta(seconds=(time.time() - self._start_time))
    logging.info("Done in %s", elapsed)
def errors_per_class(cm, dict):
    """Summarize the confusions of every class in a confusion matrix.

    For each class the off-diagonal entries of its row and of its column
    are added up, so every confusion is counted twice overall (once for
    each of the two classes involved). The grand total and the per-class
    sums, sorted from most to least confused, are logged.

    Args:
        cm: Square confusion matrix exposing ``shape`` and ``cm[i][j]``.
        dict: Lookup indexed by class index that yields the class name
            (Intents or Slots).
    """
    num_classes = cm.shape[0]
    confused_per_class = {}
    total_errors = 0

    for class_num in range(num_classes):
        class_errors = 0
        for other in range(num_classes):
            if other == class_num:
                continue
            class_errors += cm[class_num][other]
            class_errors += cm[other][class_num]
        confused_per_class[dict[class_num]] = class_errors
        total_errors += class_errors

    logging.info(f'Total errors (multiplied by 2): {total_errors}')

    ranked = sorted(confused_per_class.items(), key=lambda item: item[1], reverse=True)
    for entry in ranked:
        logging.info(entry)
def trim(self, min_count):
    """Drop every word seen fewer than ``min_count`` times.

    The vocabulary dictionaries are reset to just the PAD/SOS/EOS tokens
    and then rebuilt by calling ``self.addWord`` for each kept word. The
    method is a no-op if ``self.trimmed`` is already set.

    Args:
        min_count: Minimum count a word needs in ``self.word2count`` to be kept.
    """
    if self.trimmed:
        return
    self.trimmed = True

    keep_words = [word for word, count in self.word2count.items() if count >= min_count]

    # Guard the ratio so an empty vocabulary does not raise ZeroDivisionError.
    total_words = len(self.word2index)
    keep_ratio = len(keep_words) / total_words if total_words else 0.0
    logging.info("keep_words {} / {} = {:.4f}".format(len(keep_words), total_words, keep_ratio))

    # Reinitialize dictionaries
    self.word2index = {}
    self.word2count = {}
    self.index2word = {
        PAD_token: "PAD",
        SOS_token: "SOS",
        EOS_token: "EOS",
    }
    self.num_words = 3  # Count default tokens

    for word in keep_words:
        self.addWord(word)
def eval_epochs_done_callback(
    global_vars,
    eval_data_layer,
    do_lower_case,
    n_best_size,
    max_answer_length,
    version_2_with_negative,
    null_score_diff_threshold,
):
    """Evaluate the accumulated predictions and reset the accumulators.

    Calls ``eval_data_layer.dataset.evaluate`` with the ids and logits
    gathered in ``global_vars``, logs the scores and empties the three
    accumulator lists.

    Returns:
        dict with the keys ``"exact_match"`` and ``"f1"``.
    """
    scores = eval_data_layer.dataset.evaluate(
        unique_ids=global_vars["eval_unique_ids"],
        start_logits=global_vars["eval_start_logits"],
        end_logits=global_vars["eval_end_logits"],
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        version_2_with_negative=version_2_with_negative,
        null_score_diff_threshold=null_score_diff_threshold,
        do_lower_case=do_lower_case,
    )
    exact_match, f1, _ = scores

    logging.info(f"Exact_match = {exact_match}, f1 = {f1}")

    # Start the next evaluation round with fresh, independent lists.
    for accumulator in ("eval_unique_ids", "eval_start_logits", "eval_end_logits"):
        global_vars[accumulator] = []

    return {"exact_match": exact_match, "f1": f1}
def get_label_stats(labels, outfile='stats.tsv'):
    '''Count label occurrences and write per-label statistics to a file.

    Each line of ``outfile`` holds the label, its relative frequency
    (rounded to 5 digits) and its absolute count, most frequent first.
    The three most frequent labels are also logged.

    Args:
        labels: list of all labels
        outfile: path to the file where to save label stats

    Returns:
        total (int): total number of labels
        freq_dict (dict): maps each label to its number of occurrences
        max label: the largest key among the labels (``max`` of the keys)
    '''
    labels = Counter(labels)
    total = sum(labels.values())
    freq_dict = {}

    # The context manager makes sure the stats file is flushed and closed.
    with open(outfile, 'w') as out:
        for i, (k, v) in enumerate(labels.most_common()):
            out.write(f'{k}\t\t{round(v/total,5)}\t\t{v}\n')
            if i < 3:
                logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.')
            freq_dict[k] = v

    return total, freq_dict, max(labels.keys())
def eval_epochs_done_callback(global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True):
    '''Collect per-label F1/precision/recall for punctuation and capitalization.

    Args:
        global_vars: accumulated evaluation state, forwarded to
            ``_eval_epochs_done_callback``
        punct_label_ids: label ids of the punctuation task
        capit_label_ids: label ids of the capitalization task
        graph_fold (str): path to output folder
        normalize_cm (bool): flag to indicate whether to normalize confusion matrix

    Returns:
        dict mapping ``'<prefix><label name>'`` to a percentage rounded to two
        digits. Punctuation uses the prefixes ``'pF1 '``, ``'pPR '`` and
        ``'pR '``; capitalization uses ``'cF1: '``, ``'cPR '`` and ``'cR '``.
    '''
    # (task name, label ids, F1 prefix, precision prefix, recall prefix).
    # Capitalization gets its own precision/recall prefixes so that it no
    # longer overwrites the punctuation entries.
    task_specs = (
        ('punct', punct_label_ids, 'pF1 ', 'pPR ', 'pR '),
        ('capit', capit_label_ids, 'cF1: ', 'cPR ', 'cR '),
    )
    id_marker = '(label id'

    results = {}
    for task_name, label_ids, f1_prefix, precision_prefix, recall_prefix in task_specs:
        class_report = _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold, normalize_cm)
        for label in class_report:
            if label == 'accuracy':
                continue
            # Strip a trailing " (label id ...)" suffix from the label, if present.
            label_name = label[: label.index(id_marker) - 1] if id_marker in label else label
            metrics = class_report[label]
            results[f1_prefix + label_name] = round(metrics['f1-score'] * 100, 2)
            results[precision_prefix + label_name] = round(metrics['precision'] * 100, 2)
            results[recall_prefix + label_name] = round(metrics['recall'] * 100, 2)

    logging.info(f'results: {results}')
    return results
def write_timestamped(contents, dir=None, name=None, mode="wb"):
    """
    Writes contents to a timestamped file path in the specified directory.

    Args:
        contents (bytes-like object or callable): Either a bytes-like object that can be written to disk, or a callable which will return such an object.
        dir (str): The directory to write into. Created if it does not exist.
        name (str): The name of the file.

    Optional Args:
        mode(str): The mode to use when writing. Defaults to "wb".

    Returns:
        str: The complete file path, or None if nothing was written.
    """
    if dir is None:
        return None

    if not os.path.exists(dir):
        os.makedirs(dir, exist_ok=True)

    path = timestamped_filepath(dir, name)

    # A callable is only invoked once a target directory is known.
    if callable(contents):
        contents = contents()

    if os.path.exists(path):
        logging.warning("{:} already exists. Will not overwrite.".format(path))
        return None

    with open(path, mode) as f:
        logging.info("Writing to {:}".format(path))
        f.write(contents)
    return path
def load_plugins():
    """Load every library listed in ``plugins`` with ``ctypes.CDLL``."""
    import ctypes

    for plugin_path in map(os.path.abspath, plugins):
        logging.info("Loading plugin library: {:}".format(plugin_path))
        ctypes.CDLL(plugin_path)
def process_imdb(infold, outfold, uncased, modes=('train', 'test')):
    """Convert the IMDB review folders into ``{mode}.tsv`` files.

    Every ``{infold}/{mode}/{neg|pos}/*.txt`` review becomes one
    ``sentence\\tlabel`` line with label 0 for ``neg`` and 1 for ``pos``.
    ``<br />`` markers are removed from the text.

    Args:
        infold: Folder with the downloaded IMDB dataset.
        outfold: Folder the processed files are written to.
        uncased: Whether to lower-case the reviews.
        modes: Dataset splits to process.

    Raises:
        ValueError: If ``infold`` does not exist.
    """
    if not os.path.exists(infold):
        link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset'
        raise ValueError(f'Data not found at {infold}. ' f'Please download IMDB from {link}.')

    logging.info(f'Processing IMDB dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    for mode in modes:
        # The with-statement closes the output file even if a review fails to load.
        with open(os.path.join(outfold, mode + '.tsv'), 'w') as outfile:
            outfile.write('sentence\tlabel\n')
            # enumerate yields label 0 for 'neg' and label 1 for 'pos'.
            for label, sent in enumerate(('neg', 'pos')):
                # Reviews are read from the input folder passed as `infold`.
                for review_path in glob.glob(f'{infold}/{mode}/{sent}/*.txt'):
                    with open(review_path, 'r') as f:
                        review = f.read().strip()
                    if uncased:
                        review = review.lower()
                    review = review.replace("<br />", "")
                    outfile.write(f'{review}\t{label}\n')
def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, mode='train', is_training=True):
    """Build the data layer, model, classifier and loss for one data split.

    Relies on names defined outside this function: ``args``, ``data_desc``,
    ``tokenizer``, ``model``, ``classifier``, ``loss_fn`` and ``nemo_nlp``.

    Args:
        num_samples: Forwarded to the data layer.
        batch_size: Batch size; lowered to the dataset size when the dataset
            is smaller than one batch.
        num_gpus: Used to compute the number of steps per epoch.
        mode: Name of the split; the data is read from
            ``{data_desc.data_dir}/{mode}.tsv``.
        is_training: When False, shuffling is disabled.

    NOTE(review): the visible body ends after assigning
    ``tensors_to_evaluate`` in the training branch and has no return
    statement - confirm whether the rest of the function is missing.
    """
    logging.info(f"Loading {mode} data...")
    data_file = f'{data_desc.data_dir}/{mode}.tsv'
    # Shuffling follows the command-line flag, and only while training.
    shuffle = args.shuffle_data if is_training else False

    data_layer = nemo_nlp.nm.data_layers.BertTextClassificationDataLayer(
        input_file=data_file,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        num_samples=num_samples,
        shuffle=shuffle,
        batch_size=batch_size,
        use_cache=args.use_cache,
    )

    ids, type_ids, input_mask, labels = data_layer()
    data_size = len(data_layer)

    if data_size < batch_size:
        logging.warning("Batch_size is larger than the dataset size")
        logging.warning("Reducing batch_size to dataset size")
        # NOTE(review): the data layer above was already built with the
        # original batch_size; only the step computation below sees this value.
        batch_size = data_size

    # NOTE(review): raises ZeroDivisionError if the dataset is empty
    # (batch_size becomes 0) - confirm this cannot happen.
    steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus))

    hidden_states = model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = loss_fn(logits=logits, labels=labels)

    if is_training:
        tensors_to_evaluate = [loss, logits]
def analyze_confusion_matrix(cm, dict, max_pairs=10):
    """Log the most frequent confusions found in a confusion matrix.

    Every off-diagonal cell with a count of at least 5 is collected as
    ``'<row name> -> <column name>'``; the pairs are then sorted by count
    (largest first) and at most ``max_pairs`` of them are logged as
    ``(name -> name, value)`` tuples.

    Args:
        cm: Square confusion matrix (rows expose ``argsort``).
        dict: Lookup indexed by class index that yields the class name
            (Intents or Slots).
        max_pairs: Max number of confusions to print.
    """
    threshold = 5  # just arbitrary value to take confusion with at least this number
    confused_pairs = {}

    for row_idx in range(cm.shape[0]):
        # Visit the columns of this row from the largest count to the smallest.
        for col_idx in cm[row_idx].argsort()[::-1]:
            # no confusion - same row and column
            if col_idx == row_idx:
                continue
            # Counts only get smaller from here on, so stop at the first miss.
            if not cm[row_idx][col_idx] >= threshold:
                break
            pair_name = f'{dict[row_idx]} -> {dict[col_idx]}'
            confused_pairs[pair_name] = cm[row_idx][col_idx]

    # sort by max confusions and print first max_pairs
    ranked_pairs = sorted(confused_pairs.items(), key=lambda item: item[1], reverse=True)
    for rank, pair in enumerate(ranked_pairs):
        if rank >= max_pairs:
            break
        logging.info(pair)
def _added_token_counts(data_iterator, try_swapping, max_input_examples=10000):
    """Counts how often each phrase has to be added to produce the targets.

    Args:
        data_iterator: Iterator yielding (source list, target) pairs. The
            strings of a source list are joined with a space, possibly after
            reversing their order when swapping is enabled.
        try_swapping: Whether to also try the reversed order for examples
            with exactly two sources and keep it when it needs less added text.
        max_input_examples: Maximum number of examples read from the iterator.

    Returns:
        Tuple (collections.Counter for phrases, added phrases for each example).
    """
    phrase_counter = collections.Counter()
    all_added_phrases = []
    num_examples = 0

    for sources, target in data_iterator:
        if num_examples >= max_input_examples:
            break
        if num_examples % 1000 == 0:
            logging.info("{} examples processed.".format(num_examples))

        chosen = _get_added_phrases(' '.join(sources), target)
        if try_swapping and len(sources) == 2:
            swapped = _get_added_phrases(' '.join(sources[::-1]), target)
            # If the swapped order needs strictly less added text, assume the
            # sources would be swapped during conversion.
            if len(''.join(swapped)) < len(''.join(chosen)):
                chosen = swapped

        for phrase in chosen:
            phrase_counter[phrase] += 1
        all_added_phrases.append(chosen)
        num_examples += 1

    logging.info(f'{num_examples} examples processed.\n')
    return phrase_counter, all_added_phrases
def process_dialogflow(infold, outfold, dev_split=0.1):
    """Convert an exported Dialogflow agent into intent/slot train and test files.

    Args:
        infold: Folder with the unzipped Dialogflow export.
        outfold: Folder the processed files are written to.
        dev_split: Passed to ``partition_data`` as ``split``.

    Raises:
        ValueError: If ``infold`` does not exist.
    """
    if not os.path.exists(infold):
        link = 'www.dialogflow.com'
        # Note the space after "from": without it the link is glued to the word.
        raise ValueError(
            f'Data not found at {infold}. '
            f'Export your dialogflow data from '
            f'{link} and unzip at {infold}.'
        )

    if if_exist(outfold, [f'{mode}.tsv' for mode in ['train', 'test']]):
        logging.info(DATABASE_EXISTS_TMP.format('dialogflow', outfold))
        return

    os.makedirs(outfold, exist_ok=True)

    files = get_intent_query_files_dialogflow(infold)

    slot_labels = get_slots_dialogflow(files)

    intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels)

    train_queries, train_slots, test_queries, test_slots = partition_data(
        intent_queries, slot_tags, split=dev_split)

    write_files(train_queries, f'{outfold}/train.tsv')
    write_files(train_slots, f'{outfold}/train_slots.tsv')

    write_files(test_queries, f'{outfold}/test.tsv')
    write_files(test_slots, f'{outfold}/test_slots.tsv')

    write_files(slot_labels, f'{outfold}/dict.slots.csv')
    write_files(intent_names, f'{outfold}/dict.intents.csv')
def eval_epochs_done_callback(global_vars, label_ids, graph_fold=None, none_label_id=0, normalize_cm=True):
    """Report accuracy, F1 scores and a classification report for one evaluation.

    Labels and predictions are taken from ``global_vars`` and restricted to
    the positions where ``all_subtokens_mask`` is greater than 0.5.

    Args:
        global_vars: dict with ``'all_labels'``, ``'all_preds'`` and
            ``'all_subtokens_mask'``.
        label_ids: Forwarded to the classification report and the plot.
        graph_fold: If truthy, a confusion matrix is plotted into this folder.
        none_label_id: Not used by this function.
        normalize_cm: Whether the plotted confusion matrix is normalized.

    Returns:
        dict with the single key ``'Accuracy'``.
    """
    labels = np.asarray(global_vars['all_labels'])
    preds = np.asarray(global_vars['all_preds'])
    keep = np.asarray(global_vars['all_subtokens_mask']) > 0.5

    labels, preds = labels[keep], preds[keep]

    # print predictions and labels for a small random subset of data
    sample_size = 20
    num_preds = preds.shape[0]
    start = random.randint(0, num_preds - sample_size - 1) if num_preds > sample_size + 1 else 0
    window = slice(start, start + sample_size)
    logging.info("Sampled preds: [%s]" % list2str(preds[window]))
    logging.info("Sampled labels: [%s]" % list2str(labels[window]))

    accuracy = sum(labels == preds) / labels.shape[0]
    logging.info(f'Accuracy: {accuracy}')

    f1_scores = get_f1_scores(labels, preds, average_modes=['weighted', 'macro', 'micro'])
    for score_name, score in f1_scores.items():
        logging.info(f'{score_name}: {score}')

    logging.info(get_classification_report(labels, preds, label_ids))

    # calculate and plot confusion_matrix
    if graph_fold:
        plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm)

    return {'Accuracy': accuracy}
def process_sst_2(data_dir):
    """Check that the SST-2 data folder exists and return it unchanged.

    Args:
        data_dir: Folder expected to contain the SST-2 dataset.

    Returns:
        ``data_dir``.

    Raises:
        ValueError: If ``data_dir`` does not exist.
    """
    if os.path.exists(data_dir):
        logging.info('Keep in mind that SST-2 is only available in lower case.')
        return data_dir

    link = 'https://gluebenchmark.com/tasks'
    raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.')
def create_vocab(self):
    """Build the vocabulary from ``train_dials.json`` and pickle it.

    Slot names and the system/user transcripts of every turn are added to
    ``self.vocab``, which is then dumped to ``self.vocab_file``.
    """
    self.vocab.add_words(self.slots, 'slot')

    filename = f'{self.data_dir}/train_dials.json'
    logging.info(f'Building vocab from {filename}')
    # Close the dialogue file as soon as it has been parsed.
    with open(filename, 'r') as dialogs_file:
        dialogs = json.load(dialogs_file)

    # NOTE(review): max_value_len is tracked below but never read afterwards
    # in this method; kept because the helper call may matter - confirm.
    max_value_len = 0

    for dialog_dict in dialogs:
        for turn in dialog_dict['dialogue']:
            self.vocab.add_words(turn['system_transcript'], 'utterance')
            self.vocab.add_words(turn['transcript'], 'utterance')

            turn_beliefs = fix_general_label_error_multiwoz(turn['belief_state'], self.slots)
            lengths = [len(turn_beliefs[slot]) for slot in self.slots if slot in turn_beliefs]
            lengths.append(max_value_len)
            max_value_len = max(lengths)

    logging.info(f'Saving vocab to {self.data_dir}')
    with open(self.vocab_file, 'wb') as handle:
        pickle.dump(self.vocab, handle)
def __call__(self):
    """Load the network, configure a TensorRT builder and build an engine.

    The serialized engine is handed to ``write_timestamped`` with
    ``self.write_engine`` as target directory, and the returned path is
    stored in ``self.written_engine_path``.

    Returns:
        The engine returned by ``builder.build_engine``.
    """
    # Stand-in for the parser when the loader returns only a network, so the
    # `with` statement below can treat both cases uniformly.
    class DummyContextManager(object):
        def __enter__(self):
            return None

        def __exit__(self, exc_type, exc_value, traceback):
            return None

    network_parser = self.network_loader()
    try:
        # The loader may return a (network, parser) pair ...
        network, parser = network_parser
        assert isinstance(network, trt.INetworkDefinition)
    except (ValueError, AssertionError):
        # ... or just the network itself.
        network = network_parser
        parser = DummyContextManager()

    with trt.Builder(TRT_LOGGER) as builder, network, parser:
        if self.preprocess_network:
            logging.debug("Applying network preprocessing: {:}".format(
                self.preprocess_network))
            self.preprocess_network(network)

        if self.layerwise:
            TensorRTRunnerV2.mark_layerwise(network)

        # Only log the network when debug logging is enabled.
        if logging.getEffectiveLevel() <= logging.DEBUG:
            TensorRTRunnerV2.log_network(network)

        config = builder.create_builder_config()
        profile = TensorRTRunnerV2.build_profile(builder, network, self.profile_shapes)
        config.add_optimization_profile(profile)

        config.max_workspace_size = int(self.max_workspace_size)
        if self.fp16_mode:
            # NOTE(review): this assignment replaces any flags already set on
            # the config instead of OR-ing the FP16 bit in - confirm intended.
            config.flags = 1 << int(trt.BuilderFlag.FP16)
        if self.int8_mode:
            config.flags = config.flags | 1 << int(trt.BuilderFlag.INT8)
            if not network.has_explicit_precision:
                if not self.calibrator:
                    # NOTE(review): unless this logger's critical() raises,
                    # execution continues and the next statement fails on a
                    # missing calibrator - verify.
                    logging.critical(
                        "Network does not have explicit precision. A calibrator must be provided in order to use int8 mode."
                    )
                self.calibrator.set_input_metadata(
                    get_input_metadata_from_profile(profile, network))
                config.int8_calibrator = self.calibrator

        logging.debug("Using builder configuration flags: {:}".format(
            config.flags))
        logging.info(
            "Building engine: max workspace size={:} bytes, fp16={:}, int8={:}, layerwise={:}"
            .format(self.max_workspace_size, self.fp16_mode, self.int8_mode, self.layerwise))
        engine = builder.build_engine(network, config)
        # The lambda defers serialization until write_timestamped needs it.
        self.written_engine_path = write_timestamped(
            contents=lambda: engine.serialize(),
            dir=self.write_engine,
            name="tensorrt_runner_v2.engine")
        return engine
def receive_on_queue(queue, timeout=None):
    """Block until an object arrives on ``queue`` and return it.

    Objects for which ``is_compressed`` is true are passed through
    ``decompress`` before being returned.

    Args:
        queue: Object exposing ``get(block=..., timeout=...)``.
        timeout: Forwarded to ``queue.get``.
    """
    logging.info("Waiting for data to become available on queue")
    received = queue.get(block=True, timeout=timeout)

    if is_compressed(received):
        logging.debug("Decompressing output")
        received = decompress(received)

    logging.info("Received {:} on queue".format(received))
    return received
def on_iteration_start(self):
    """Start the profiler at step 4 and record the iteration start time.

    The start time is only stored when ``self.global_rank`` is ``None`` or ``0``.
    """
    if self.step == 4:
        profiler.start()
        message = "********************Starting profiler at step: " + str(self.step)
        logging.info(message)

    rank = self.global_rank
    if rank is None or rank == 0:
        self._last_iter_start = time.time()
def readVocs(datafile, corpus_name):
    """Read tab-separated lines from ``datafile`` and create an empty ``Voc``.

    Args:
        datafile: Path of a UTF-8 text file whose lines hold tab-separated fields.
        corpus_name: Name passed to ``Voc``.

    Returns:
        Tuple ``(voc, pairs)`` where ``pairs`` holds, for every line, the
        list of its fields after ``normalizeString``.
    """
    logging.info("Reading lines...")
    # Read the file and split into lines; the with-statement closes the handle.
    with open(datafile, encoding="utf-8") as corpus_file:
        lines = corpus_file.read().strip().split("\n")
    # Split every line into pairs and normalize
    pairs = [[normalizeString(field) for field in line.split("\t")] for line in lines]
    voc = Voc(corpus_name)
    return voc, pairs
def eval_epochs_done_callback(global_vars):
    """Log the mean evaluation loss and top-1 score, then reset all accumulators.

    Args:
        global_vars: dict with at least ``"eval_loss"`` and ``"top1"``;
            every key is reset to a new empty list afterwards.

    Returns:
        dict with the keys ``"Evaluation Loss"`` and ``"Evaluation Top@1"``.
    """
    eloss = mean(global_vars["eval_loss"])
    etop1 = mean(global_vars["top1"])
    logging.info(f"Evaluation Loss: {eloss}")
    logging.info(f"Evaluation Top@1: {etop1}")

    # Each key gets its own fresh list so later appends stay independent.
    for name in global_vars:
        global_vars[name] = []

    return {"Evaluation Loss": eloss, "Evaluation Top@1": etop1}
def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=None, normalize_cm=True):
    """Log the classification report of one task and return it as a dict.

    Args:
        task_name: Prefix of the ``'<task>_labels'`` / ``'<task>_preds'``
            entries in ``global_vars``; also used as plot prefix.
        global_vars: dict holding the accumulated labels and predictions.
        label_ids: Forwarded to the report and plot helpers.
        graph_fold: If truthy, a confusion matrix is plotted into this folder.
        normalize_cm: Whether the plotted confusion matrix is normalized.

    Returns:
        The result of ``get_classification_report(..., output_dict=True)``.
    """
    true_labels = np.array(global_vars[task_name + '_labels'])
    predictions = np.array(global_vars[task_name + '_preds'])

    # calculate and plot confusion_matrix
    if graph_fold:
        plot_confusion_matrix(
            true_labels, predictions, graph_fold, label_ids, normalize=normalize_cm, prefix=task_name
        )

    text_report = get_classification_report(true_labels, predictions, label_ids)
    logging.info(f'{text_report}')

    return get_classification_report(true_labels, predictions, label_ids, output_dict=True)
def get_vocab(self):
    """Load the pickled vocabulary if it exists, otherwise build it.

    Sets ``self.vocab_file`` to ``{self.data_dir}/vocab.pkl``. When that file
    exists, ``self.vocab`` is unpickled from it; otherwise
    ``self.create_vocab()`` is called.
    """
    self.vocab_file = f'{self.data_dir}/vocab.pkl'

    if os.path.exists(self.vocab_file):
        logging.info(f'Loading vocab from {self.data_dir}')
        # NOTE: unpickling can execute arbitrary code - only load vocab files
        # that come from a trusted source.
        with open(self.vocab_file, 'rb') as handle:
            self.vocab = pickle.load(handle)
    else:
        self.create_vocab()

    logging.info(f'Vocab size {len(self.vocab)}')
def forward(self, mel_spectrogram):
    """Run WaveGlow inference on a mel spectrogram.

    On the first call weight normalization is removed from ``self.waveglow``.

    Args:
        mel_spectrogram: Input passed to ``self.waveglow.infer``.

    Returns:
        The output of ``self.waveglow.infer`` computed under ``torch.no_grad()``.

    Raises:
        ValueError: If the module is in training mode.
    """
    weight_norm_pending = not self._removed_weight_norm
    if weight_norm_pending:
        logging.info("remove WN")
        self.waveglow = self.waveglow.remove_weightnorm(self.waveglow)
        self._removed_weight_norm = True

    if self.training:
        raise ValueError("You are using the WaveGlow Infer Neural Module in training mode.")

    with torch.no_grad():
        return self.waveglow.infer(mel_spectrogram, sigma=self._sigma)
def write_predictions_to_file(predictions, input_json_files, output_dir, schemas, state_tracker, eval_debug,
                              in_domain_services):
    """Write the predicted dialogues as json files.

    Args:
        predictions: An iterator containing model predictions. This is the
            output of the predict method in the estimator.
        input_json_files: A list of json paths containing the dialogues to run
            inference on.
        output_dir: The directory where output json files will be created.
        schemas: Schemas to all services in the dst dataset (train, dev and test
            splits).
        state_tracker: Either ``'baseline'`` or ``'nemotracker'``; selects the
            function used to build each predicted dialogue.
        eval_debug: Forwarded to the nemotracker dialogue builder.
        in_domain_services: Forwarded to the nemotracker dialogue builder.

    Raises:
        ValueError: If ``state_tracker`` is not one of the supported values.
    """
    logging.info(f"Writing predictions to {output_dir} started.")

    # Index all predictions.
    all_predictions = {}
    idx = None
    eval_dataset = None
    for idx, prediction in enumerate(predictions):
        if not prediction["is_real_example"]:
            continue
        eval_dataset, dialog_id, turn_id, service_name = prediction['example_id'].split('-')
        all_predictions[(dialog_id, turn_id, service_name)] = prediction
    # Without any real example there is no dataset name to report.
    if eval_dataset is not None:
        logging.info(f'Predictions for {idx} examples in {eval_dataset} dataset are getting processed.')

    # Read each input file and write its predictions.
    for input_file_path in input_json_files:
        # The with-statements close the files; no explicit close() is needed.
        with open(input_file_path) as f:
            dialogs = json.load(f)
        logging.debug(f'{input_file_path} file is loaded')

        pred_dialogs = []
        for d in dialogs:
            if state_tracker == 'baseline':
                pred_dialog = get_predicted_dialog_baseline(d, all_predictions, schemas)
            elif state_tracker == 'nemotracker':
                pred_dialog = get_predicted_dialog_nemotracker(
                    d, all_predictions, schemas, eval_debug, in_domain_services)
            else:
                raise ValueError(f"tracker_mode {state_tracker} is not defined.")
            pred_dialogs.append(pred_dialog)

        input_file_name = os.path.basename(input_file_path)
        output_file_path = os.path.join(output_dir, input_file_name)
        with open(output_file_path, "w") as f:
            json.dump(pred_dialogs, f, indent=2, separators=(",", ": "), sort_keys=True)