import csv
import itertools
import logging
import os
import shutil
from typing import List
from urllib.request import Request, urlopen

# Helpers such as get_vocab, ids2text, if_exist, copy_input_files, get_intents,
# get_intent_queries, get_slots, get_slot_queries, write_files, get_dataset,
# create_dataset, the label/stats utilities, and the message templates
# (DATABASE_EXISTS_TMP, MODE_EXISTS_TMP) are assumed to be defined elsewhere
# in this module.


def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines()
        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines()
        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines()

        for i, query in enumerate(queries):
            # [1:-1] drops the BOS/EOS markers that wrap each ATIS query
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()  # also close the slot files
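
# ids2text (used above) is assumed to map a sequence of vocabulary ids back to
# a whitespace-joined sentence, with get_vocab returning an {id: token} dict.
# A minimal sketch of that assumption; _ids2text_sketch is a hypothetical
# stand-in, not the module's actual helper:
def _ids2text_sketch(ids, vocab):
    # e.g. _ids2text_sketch(['12', '7'], {12: 'list', 7: 'flights'}) -> 'list flights'
    return ' '.join(vocab[int(i)] for i in ids)
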
def process_assistant(infold, outfold, modes=['train', 'test']):
    """
    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 multi-domain intents and 57 entity types.
    """
    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and storing at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # copy train/test files to a convenient directory to work with
    copy_input_files(infold)
    infold += "/dataset"

    # get the list of intents from the train folder (the test folder is
    # assumed to contain the same set)
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # get all train and test queries with their intent
    for mode in modes:
        intent_queries = get_intent_queries(infold, intent_names, mode)
        write_files(intent_queries, f'{outfold}/{mode}.tsv')

    # get the list of all unique slots in the training and testing files
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # create files of slot queries
    slot_dict = {k: v for v, k in enumerate(slot_types)}
    for mode in modes:
        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
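
# A minimal usage sketch for process_assistant (paths are hypothetical);
# infold must hold the train/test files from the NLU-Evaluation-Data repo:
#
#   process_assistant('data/nlu-eval-data', 'data/assistant_processed')
#   # -> dict.intents.csv, dict.slots.csv, {train,test}.tsv, {train,test}_slots.tsv
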
def process_text2sparql(infold: str, outfold: str, do_lower_case: bool):
    """ Process and convert MeetKai's text2sparql datasets to NeMo's
    neural machine translation format.

    Args:
        infold: directory path to raw text2sparql data containing
            train.tsv, test_easy.tsv, test_hard.tsv
        outfold: output directory path to save formatted data for
            NeuralMachineTranslationDataset.
            The first line is the header (sentence [tab] label);
            each following line is [sentence][tab][label].
        do_lower_case: if true, convert all sentences and labels to lowercase
    """
    logging.info(f"Processing Text2Sparql dataset and storing at: {outfold}")

    os.makedirs(outfold, exist_ok=True)

    dataset_name = "Text2Sparql"
    for prefix in prefix_map:
        input_file = os.path.join(infold, prefix)
        output_file = os.path.join(outfold, prefix_map[prefix])

        if if_exist(outfold, [prefix_map[prefix]]):
            logging.info(f"** {MODE_EXISTS_TMP.format(prefix_map[prefix], dataset_name, output_file)}")
            continue
        if not if_exist(infold, [prefix]):
            logging.info(f"** {prefix} of {dataset_name} is skipped as it was not found")
            continue

        assert input_file != output_file, "input file cannot equal output file"
        with open(input_file, "r") as in_file:
            with open(output_file, "w") as out_file:
                reader = csv.reader(in_file, delimiter="\t")

                # replace the header
                out_file.write("sentence\tlabel\n")
                next(reader)

                for line in reader:
                    sentence = line[0]
                    label = line[1]
                    if do_lower_case:
                        sentence = sentence.lower()
                        label = label.lower()
                    out_file.write(f"{sentence}\t{label}\n")
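
# process_text2sparql and download_text2sparql both rely on module-level
# constants defined elsewhere in this file. Their exact values are not shown
# in this excerpt; the shape below is an illustrative assumption only:
#
#   base_url = "https://..."                        # root URL of the raw files
#   prefix_map = {
#       "train_queries.tsv": "train.tsv",           # raw name -> NeMo name
#       "test_easy_queries.tsv": "test_easy.tsv",
#       "test_hard_queries.tsv": "test_hard.tsv",
#   }
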
def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1):
    if not os.path.exists(infold):
        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
        raise ValueError(
            f'Data not found at {infold}. '
            f'You may request to download the SNIPS dataset from {link}.'
        )

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.')
    logging.info(
        'Processing and importing "smart-speaker-en-close-field" -> "speak" '
        'and "smart-lights-en-close-field" -> "light".'
    )

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{infold}/{light_dir}/dataset.json']
    speak_files = [f'{infold}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{infold}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents, do_lower_case, f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents, do_lower_case, f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        do_lower_case,
        f'{outfold}/all',
    )
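
# A minimal usage sketch for process_snips (paths are hypothetical). Three
# subfolders are produced, one per sub-dataset plus their union:
#
#   process_snips('data/snips_raw', 'data/snips_processed', do_lower_case=True)
#   # -> data/snips_processed/{light,speak,all}/ with the per-mode files
#   #    written by create_dataset
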
def download_text2sparql(infold: str):
    """Downloads text2sparql train, test_easy, and test_hard data

    Args:
        infold: save directory path
    """
    os.makedirs(infold, exist_ok=True)

    for prefix in prefix_map:
        url = base_url + prefix
        logging.info(f"Downloading: {url}")

        if if_exist(infold, [prefix]):
            logging.info("** Download file already exists, skipping download")
        else:
            req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
            with open(os.path.join(infold, prefix), "wb") as handle:
                handle.write(urlopen(req, timeout=20).read())
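
# The two text2sparql helpers are meant to be chained: download the raw files
# first, then convert them. A minimal usage sketch with hypothetical paths:
#
#   download_text2sparql('data/text2sparql_raw')
#   process_text2sparql('data/text2sparql_raw', 'data/text2sparql_nmt', do_lower_case=True)
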
def process_jarvis_datasets(
    infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False
):
    """ Process and convert Jarvis datasets into NeMo's BIO format """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name} is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/{mode}.tsv', 'r').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t{str(intents_list[intent_str])}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    # each tag is "start:end:slot_name" with character offsets
                    [start_i, end_i, slot_name] = st.strip().split(":")
                    slot_tags_list.append([int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    # words between the previous span and this one are 'O'
                    words_list = sentence[processed_index:tag_start].strip().split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            slots = slots[1:-1]  # drop the tags for the BOS/EOS markers
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
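
# The span-to-BIO conversion above is the trickiest part of process_jarvis_datasets.
# A self-contained sketch of the same idea with a made-up sentence and a single
# "start:end:slot" span (character offsets, as in the Jarvis format):
def _bio_sketch():
    sentence = "BOS play jazz music EOS"
    tag_start, tag_end, tag_str = 9, 19, "genre"  # the span covers "jazz music"
    tags = []
    # words before the span are outside any slot
    tags += ["O"] * len(sentence[:tag_start].strip().split())
    # first word of the span gets B-, the rest get I-
    span_words = sentence[tag_start:tag_end].strip().split()
    tags += [f"B-{tag_str}"] + [f"I-{tag_str}"] * (len(span_words) - 1)
    # words after the span are 'O' again
    tags += ["O"] * len(sentence[tag_end:].strip().split())
    return tags[1:-1]  # drop BOS/EOS markers, as the function above does

# _bio_sketch() -> ['O', 'B-genre', 'I-genre']
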
class IntentSlotDataDesc:
    # The class name follows the internal IntentSlotDataDesc.label2idx call
    # below; the label2idx static helper itself is assumed to be defined on
    # this class elsewhere.

    def __init__(
        self,
        data_dir: str,
        modes: List[str] = ['train', 'test', 'dev'],
        none_slot_label: str = 'O',
        pad_label: int = -1,
    ):
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv."
            )

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'

        self.intents_label_ids = IntentSlotDataDesc.label2idx(self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = IntentSlotDataDesc.label2idx(self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        infold = self.data_dir
        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f' Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
                continue
            logging.info(f' Stats calculating for {mode} mode...')

            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # skip the header at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines match the "
                    "number of intent lines. There should be a 1-1 "
                    "correspondence between every slot and intent lines."
                )

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, raw_intents = [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))

            logging.info(f'Three most popular intents in {mode} mode:')
            total_intents, intent_label_freq, max_id = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv'
            )

            merged_slots = itertools.chain.from_iterable(raw_slots)
            logging.info(f'Three most popular slots in {mode} mode:')
            slots_total, slots_label_freq, max_id = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv'
            )

            logging.info(f'Total Number of Intents: {total_intents}')
            logging.info(f'Intent Label Frequencies: {intent_label_freq}')
            logging.info(f'Total Number of Slots: {slots_total}')
            logging.info(f'Slots Label Frequencies: {slots_label_freq}')

            if mode == 'train':
                intent_weights_dict = get_freq_weights(intent_label_freq)
                logging.info(f'Intent Weights: {intent_weights_dict}')
                slot_weights_dict = get_freq_weights(slots_label_freq)
                logging.info(f'Slot Weights: {slot_weights_dict}')

        self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1)
        self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1)

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f'none_slot_label {none_slot_label} not found in {self.slot_dict_file}.')
            self.pad_label = self.slots_label_ids[none_slot_label]
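
# IntentSlotDataDesc.label2idx is referenced above but not shown in this
# excerpt. A minimal sketch of what it is assumed to do (map each non-empty
# line of a label file to its line index), plus a hypothetical usage:
#
#   @staticmethod
#   def label2idx(file):
#       lines = [line.strip() for line in open(file, 'r') if line.strip()]
#       return {label: i for i, label in enumerate(lines)}
#
#   desc = IntentSlotDataDesc(data_dir='data/atis_processed')  # hypothetical path
#   desc.num_intents, desc.num_slots, desc.pad_label, desc.intent_weights
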
class MultiLabelIntentSlotDataDesc:
    # Class name is an assumption, chosen by analogy with IntentSlotDataDesc;
    # the error message below refers to MultiLabelIntentSlotDataset.

    def __init__(
        self,
        data_dir: str,
        modes: List[str] = ["train", "test", "dev"],
        none_slot_label: str = "O",
        pad_label: int = -1,
    ):
        if not if_exist(data_dir, ["dict.intents.csv", "dict.slots.csv"]):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by MultiLabelIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv."
            )

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + "/dict.intents.csv"
        self.slot_dict_file = self.data_dir + "/dict.slots.csv"

        self.intents_label_ids = get_labels_to_labels_id_mapping(self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = get_labels_to_labels_id_mapping(self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        infold = self.data_dir
        for mode in modes:
            if not if_exist(self.data_dir, [f"{mode}.tsv"]):
                logging.info(f" Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.")
                continue
            logging.info(f" Stats calculating for {mode} mode...")

            slot_file = f"{self.data_dir}/{mode}_slots.tsv"
            with open(slot_file, "r") as f:
                slot_lines = f.readlines()

            input_file = f"{self.data_dir}/{mode}.tsv"
            with open(input_file, "r") as f:
                input_lines = f.readlines()[1:]  # skip the header at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines match the "
                    "number of intent lines. There should be a 1-1 "
                    "correspondence between every slot and intent lines."
                )

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, raw_intents = [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                # the second column holds a comma-separated list of intent ids;
                # one-hot encode it over all known intents
                parts = input_line.strip().split("\t")[1]
                parts = list(map(int, parts.split(",")))
                parts = [1 if label in parts else 0 for label in range(self.num_intents)]
                raw_intents.append(tuple(parts))

            logging.info(f"Three most popular intents in {mode} mode:")
            total_intents, intent_label_freq, max_id = get_multi_label_stats(
                raw_intents, infold + f"/{mode}_intent_stats.tsv"
            )

            merged_slots = itertools.chain.from_iterable(raw_slots)
            logging.info(f"Three most popular slots in {mode} mode:")
            slots_total, slots_label_freq, max_id = get_label_stats(
                merged_slots, infold + f"/{mode}_slot_stats.tsv"
            )

            logging.info(f"Total Number of Intent Labels: {total_intents}")
            logging.info(f"Intent Label Frequencies: {intent_label_freq}")
            logging.info(f"Total Number of Slots: {slots_total}")
            logging.info(f"Slots Label Frequencies: {slots_label_freq}")

            if mode == "train":
                intent_weights_dict = get_freq_weights_bce_with_logits_loss(intent_label_freq)
                logging.info(f"Intent Weights: {intent_weights_dict}")
                slot_weights_dict = get_freq_weights(slots_label_freq)
                logging.info(f"Slot Weights: {slot_weights_dict}")

        self.intent_weights = fill_class_weights(intent_weights_dict, self.num_intents - 1)
        self.slot_weights = fill_class_weights(slot_weights_dict, self.num_slots - 1)

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f"none_slot_label {none_slot_label} not found in {self.slot_dict_file}.")
            self.pad_label = self.slots_label_ids[none_slot_label]
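
# The intent column of a multi-label TSV row is a comma-separated list of
# intent ids, which the loop above one-hot encodes. A standalone sketch of
# that step with made-up values (_one_hot_sketch is illustrative only):
def _one_hot_sketch(raw="1,3", num_intents=5):
    ids = list(map(int, raw.split(",")))
    return tuple(1 if label in ids else 0 for label in range(num_intents))

# _one_hot_sketch() -> (0, 1, 0, 1, 0)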