def __init__(self, data_dir, modes=['train', 'test', 'dev']):
    self.data_dir = data_dir
    max_label = 0
    for mode in modes:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        try:
            int(input_lines[0].strip().split()[-1])
        except ValueError:
            logging.warning(f'No numerical labels found for {mode}.tsv.')
            raise

        queries, raw_sentences = [], []
        for input_line in input_lines:
            parts = input_line.strip().split()
            label = int(parts[-1])
            if label > max_label:
                max_label = label
            raw_sentences.append(label)
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular classes in {mode} dataset')
        total_sents, sent_label_freq = get_label_stats(
            raw_sentences, infold + f'/{mode}_sentence_stats.tsv')

        if mode == 'train':
            self.class_weights = calc_class_weights(sent_label_freq)
            logging.info(f'Class weights are - {self.class_weights}')

        logging.info(f'Total Sentences - {total_sents}')
        logging.info(f'Sentence class frequencies - {sent_label_freq}')

    self.num_labels = max_label + 1
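A minimal usage sketch for the descriptor above. The enclosing class name (SentenceClassificationDataDesc) is an assumption for illustration, as is the directory layout: data_dir holds {mode}.tsv files with a header row and lines of the form "query text [TAB] integer label".

# Hypothetical usage; the class name is assumed, not shown in the snippet above.
data_desc = SentenceClassificationDataDesc(data_dir='./my_data')
print(data_desc.num_labels)     # highest label seen across all modes, plus one
print(data_desc.class_weights)  # only set if train.tsv was found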
# Define classifiers for the Punctuation and Capitalization tasks
punct_classifier = TokenClassifier(
    hidden_size=bert_model.hidden_size,
    num_classes=len(punct_label_ids),
    dropout=CLASSIFICATION_DROPOUT,
    num_layers=PUNCT_NUM_FC_LAYERS,
    name='Punctuation',
)
capit_classifier = TokenClassifier(
    hidden_size=bert_model.hidden_size,
    num_classes=len(capit_label_ids),
    dropout=CLASSIFICATION_DROPOUT,
    name='Capitalization',
)

# If you do not want a weighted loss for the Punctuation task,
# pass class_weights=None instead.
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)

# Define the losses: one per task, aggregated into a single training loss
punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
capit_loss = CrossEntropyLossNM(logits_ndim=3)
task_loss = LossAggregatorNM(num_inputs=2)

(input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask,
 punct_labels, capit_labels) = train_data_layer()

hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)

# Both task heads share the same BERT hidden states
punct_logits = punct_classifier(hidden_states=hidden_states)
capit_logits = capit_classifier(hidden_states=hidden_states)
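The graph above stops at the logits. A likely continuation, assuming the same tensor names, feeds them into the loss modules and aggregates the two task losses into one training objective; the train branch of create_pipeline below does exactly this wiring:

# Sketch of the loss wiring, mirroring the train branch of create_pipeline below.
punct_loss = punct_loss(logits=punct_logits, labels=punct_labels, loss_mask=loss_mask)
capit_loss = capit_loss(logits=capit_logits, labels=capit_labels, loss_mask=loss_mask)
task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)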
def create_pipeline(
    pad_label=args.none_label,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    mode='train',
    punct_label_ids=None,
    capit_label_ids=None,
    ignore_extra_tokens=args.ignore_extra_tokens,
    ignore_start_end=args.ignore_start_end,
    overwrite_processed_files=args.overwrite_processed_files,
    dropout=args.fc_dropout,
    punct_num_layers=args.punct_num_fc_layers,
    capit_num_layers=args.capit_num_fc_layers,
    classifier=PunctCapitTokenClassifier,
):
    logging.info(f'Loading {mode} data...')
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    # Both files are required
    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be '
            'split into 2 files: text.txt and labels.txt. Each line of the '
            'text.txt file contains text sequences, where words are '
            'separated with spaces. The labels.txt file contains '
            'corresponding labels for each word in text.txt; the labels are '
            'separated with spaces. Each line of the files should follow '
            'the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = PunctuationCapitalizationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        punct_label_ids=punct_label_ids,
        capit_label_ids=capit_label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        overwrite_processed_files=overwrite_processed_files,
        num_workers=args.num_workers,
        pin_memory=args.enable_pin_memory,
    )

    (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask,
     punct_labels, capit_labels) = data_layer()

    if mode == 'train':
        punct_label_ids = data_layer.dataset.punct_label_ids
        capit_label_ids = data_layer.dataset.capit_label_ids

    class_weights = None
    if args.use_weighted_loss_punct:
        logging.info('Using weighted loss for punctuation task')
        punct_label_freqs = data_layer.dataset.punct_label_frequencies
        class_weights = calc_class_weights(punct_label_freqs)

    classifier = classifier(
        hidden_size=hidden_size,
        punct_num_classes=len(punct_label_ids),
        capit_num_classes=len(capit_label_ids),
        dropout=dropout,
        punct_num_layers=punct_num_layers,
        capit_num_layers=capit_num_layers,
    )

    punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
    capit_loss = CrossEntropyLossNM(logits_ndim=3)
    task_loss = LossAggregatorNM(
        num_inputs=2,
        weights=[args.punct_loss_weight, 1.0 - args.punct_loss_weight])

    hidden_states = model(input_ids=input_ids,
                          token_type_ids=input_type_ids,
                          attention_mask=input_mask)

    punct_logits, capit_logits = classifier(hidden_states=hidden_states)

    if mode == 'train':
        punct_loss = punct_loss(logits=punct_logits,
                                labels=punct_labels,
                                loss_mask=loss_mask)
        capit_loss = capit_loss(logits=capit_logits,
                                labels=capit_labels,
                                loss_mask=loss_mask)
        task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

        steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

        losses = [task_loss, punct_loss, capit_loss]
        logits = [punct_logits, capit_logits]
        return losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, classifier
    else:
        tensors_to_evaluate = [
            punct_logits, capit_logits, punct_labels, capit_labels, subtokens_mask
        ]
        return tensors_to_evaluate, data_layer
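A sketch of how create_pipeline might be invoked, assuming the surrounding script defines args, tokenizer, model, and hidden_size as used above. The label maps built on the train split are reused for evaluation so both pipelines share the same label ids:

# Hypothetical wiring of a train and an eval pipeline.
losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, classifier = create_pipeline()
eval_tensors, eval_data_layer = create_pipeline(
    mode='dev',
    punct_label_ids=punct_label_ids,  # reuse train-split label maps
    capit_label_ids=capit_label_ids,
)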
def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
    if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
        raise FileNotFoundError(
            "Make sure that your data follows the standard format "
            "supported by JointIntentSlotDataset. Your data must "
            "contain dict.intents.csv and dict.slots.csv.")

    self.data_dir = data_dir
    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'

    self.intents_label_ids = JointIntentSlotDataDesc.label2idx(self.intent_dict_file)
    self.num_intents = len(self.intents_label_ids)
    self.slots_label_ids = JointIntentSlotDataDesc.label2idx(self.slot_dict_file)
    self.num_slots = len(self.slots_label_ids)

    for mode in ['train', 'test', 'dev']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        logging.info(f'Calculating stats for {mode} mode...')

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines matches the "
                "number of intent lines. There should be a one-to-one "
                "correspondence between slot and intent lines.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = self.data_dir

        logging.info(f'Three most popular intents in {mode} mode:')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')

        merged_slots = itertools.chain.from_iterable(raw_slots)
        logging.info(f'Three most popular slots in {mode} mode:')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in self.slots_label_ids:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = self.slots_label_ids[none_slot_label]
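A usage sketch, assuming data_dir follows the layout the constructor checks for (dict.intents.csv, dict.slots.csv, plus {mode}.tsv and {mode}_slots.tsv per split); the directory path is illustrative:

# Hypothetical usage of the descriptor defined above.
data_desc = JointIntentSlotDataDesc(data_dir='./nemo_data')
print(data_desc.num_intents, data_desc.num_slots)
print(data_desc.pad_label)  # id of none_slot_label ('O') unless pad_label was passed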
def __init__(self, data_dir, do_lower_case=False, dataset_name='default',
             none_slot_label='O', pad_label=-1):
    if dataset_name == 'atis':
        self.data_dir = process_atis(data_dir, do_lower_case)
    elif dataset_name == 'snips-atis':
        self.data_dir, self.pad_label = self.merge(
            data_dir,
            ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'],
            dataset_name)
    elif dataset_name == 'dialogflow':
        self.data_dir = process_dialogflow(data_dir, do_lower_case)
    elif dataset_name == 'mturk-processed':
        self.data_dir = process_mturk(data_dir, do_lower_case)
    elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']):
        self.data_dir = process_snips(data_dir, do_lower_case)
        if dataset_name.endswith('light'):
            self.data_dir = f'{self.data_dir}/light'
        elif dataset_name.endswith('speak'):
            self.data_dir = f'{self.data_dir}/speak'
        elif dataset_name.endswith('all'):
            self.data_dir = f'{self.data_dir}/all'
    elif dataset_name.startswith('jarvis'):
        self.data_dir = process_jarvis_datasets(
            data_dir,
            do_lower_case,
            dataset_name,
            modes=['train', 'test', 'eval'],
            ignore_prev_intent=False)
    else:
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")
        self.data_dir = data_dir

    self.intent_dict_file = self.data_dir + '/dict.intents.csv'
    self.slot_dict_file = self.data_dir + '/dict.slots.csv'
    self.num_intents = len(get_vocab(self.intent_dict_file))
    slots = label2idx(self.slot_dict_file)
    self.num_slots = len(slots)

    for mode in ['train', 'test', 'eval']:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        slot_file = f'{self.data_dir}/{mode}_slots.tsv'
        with open(slot_file, 'r') as f:
            slot_lines = f.readlines()

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        if len(slot_lines) != len(input_lines):
            raise ValueError(
                "Make sure that the number of slot lines matches the "
                "number of intent lines. There should be a one-to-one "
                "correspondence between slot and intent lines.")

        dataset = list(zip(slot_lines, input_lines))

        raw_slots, queries, raw_intents = [], [], []
        for slot_line, input_line in dataset:
            slot_list = [int(slot) for slot in slot_line.strip().split()]
            raw_slots.append(slot_list)
            parts = input_line.strip().split()
            raw_intents.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular intents in {mode} mode:')
        total_intents, intent_label_freq = get_label_stats(
            raw_intents, infold + f'/{mode}_intent_stats.tsv')

        merged_slots = itertools.chain.from_iterable(raw_slots)
        logging.info(f'Three most popular slots in {mode} mode:')
        slots_total, slots_label_freq = get_label_stats(
            merged_slots, infold + f'/{mode}_slot_stats.tsv')

        if mode == 'train':
            self.slot_weights = calc_class_weights(slots_label_freq)
            logging.info(f'Slot weights are - {self.slot_weights}')
            self.intent_weights = calc_class_weights(intent_label_freq)
            logging.info(f'Intent weights are - {self.intent_weights}')

        logging.info(f'Total intents - {total_intents}')
        logging.info(f'Intent label frequency - {intent_label_freq}')
        logging.info(f'Total Slots - {slots_total}')
        logging.info(f'Slots label frequency - {slots_label_freq}')

    if pad_label != -1:
        self.pad_label = pad_label
    else:
        if none_slot_label not in slots:
            raise ValueError(f'none_slot_label {none_slot_label} not '
                             f'found in {self.slot_dict_file}.')
        self.pad_label = slots[none_slot_label]
def __init__(self, dataset_name, data_dir, do_lower_case,
             modes=['train', 'test', 'eval']):
    if dataset_name == 'sst-2':
        self.data_dir = process_sst_2(data_dir)
        self.num_labels = 2
        self.eval_file = self.data_dir + '/dev.tsv'
    elif dataset_name == 'imdb':
        self.num_labels = 2
        self.data_dir = process_imdb(data_dir, do_lower_case)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name == 'thucnews':
        self.num_labels = 14
        self.data_dir = process_thucnews(data_dir)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name.startswith('nlu-'):
        # Each corpus resolves to a single json file consumed by process_nlu
        if dataset_name.endswith('chat'):
            data_dir = f'{data_dir}/ChatbotCorpus.json'
            self.num_labels = 2
        elif dataset_name.endswith('ubuntu'):
            data_dir = f'{data_dir}/AskUbuntuCorpus.json'
            self.num_labels = 5
        elif dataset_name.endswith('web'):
            data_dir = f'{data_dir}/WebApplicationsCorpus.json'
            self.num_labels = 8
        self.data_dir = process_nlu(data_dir,
                                    do_lower_case,
                                    dataset_name=dataset_name)
        self.eval_file = self.data_dir + '/test.tsv'
    elif dataset_name.startswith('jarvis'):
        self.data_dir = process_jarvis_datasets(
            data_dir,
            do_lower_case,
            dataset_name,
            modes=['train', 'test', 'eval'],
            ignore_prev_intent=False)
        intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv')
        self.num_labels = len(intents)
    elif dataset_name == 'default_format':
        # default_format: the data is already preprocessed; each file has a
        # header, and every line follows the format text [TAB] label,
        # where the label is an integer.
        self.data_dir = data_dir
    else:
        raise ValueError(
            "Looks like you passed a dataset name that isn't already "
            "supported by NeMo. Please make sure that you build the "
            "preprocessing method for it. default_format assumes that a "
            "data file has a header and each line of the file follows "
            "the format: text [TAB] label. The label is assumed to be "
            "an integer.")

    self.train_file = self.data_dir + '/train.tsv'

    for mode in modes:
        if not if_exist(self.data_dir, [f'{mode}.tsv']):
            logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
            continue

        input_file = f'{self.data_dir}/{mode}.tsv'
        with open(input_file, 'r') as f:
            input_lines = f.readlines()[1:]  # Skipping headers at index 0

        try:
            int(input_lines[0].strip().split()[-1])
        except ValueError:
            logging.warning(f'No numerical labels found for {mode}.tsv in {dataset_name} dataset.')
            raise

        queries, raw_sentences = [], []
        for input_line in input_lines:
            parts = input_line.strip().split()
            raw_sentences.append(int(parts[-1]))
            queries.append(' '.join(parts[:-1]))

        infold = input_file[:input_file.rfind('/')]

        logging.info(f'Three most popular classes in {mode} dataset')
        total_sents, sent_label_freq = get_label_stats(
            raw_sentences, infold + f'/{mode}_sentence_stats.tsv')

        if mode == 'train':
            self.class_weights = calc_class_weights(sent_label_freq)
            logging.info(f'Class weights are - {self.class_weights}')

        logging.info(f'Total Sentences - {total_sents}')
        logging.info(f'Sentence class frequencies - {sent_label_freq}')
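A usage sketch for one of the supported preprocessing branches; the class name (SentenceClassificationDataDesc) and the data path are assumptions for illustration:

# Hypothetical usage with the SST-2 branch.
data_desc = SentenceClassificationDataDesc(
    dataset_name='sst-2', data_dir='./SST-2', do_lower_case=False)
print(data_desc.num_labels)                       # 2 for SST-2
print(data_desc.train_file, data_desc.eval_file)  # train.tsv / dev.tsv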