Example #1
def read_intent_slot_outputs(queries,
                             intent_file,
                             slot_file,
                             intent_logits,
                             slot_logits,
                             slot_masks,
                             intents=None,
                             slots=None):
    """Log predicted (and optionally gold) intents and slots for each query."""
    intent_dict = get_vocab(intent_file)
    slot_dict = get_vocab(slot_file)
    pred_intents = np.argmax(intent_logits, axis=1)
    pred_slots = np.argmax(slot_logits, axis=2)
    slot_masks = slot_masks > 0.5  # boolean mask of real (non-padding) tokens
    for i, query in enumerate(queries):
        logging.info(f'Query: {query}')
        pred = pred_intents[i]
        logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}')
        if intents is not None:
            logging.info(
                f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}')

        # Keep slot predictions only for real (unmasked) tokens.
        pred_slot = pred_slots[i][slot_masks[i]]
        tokens = query.strip().split()

        if len(pred_slot) != len(tokens):
            raise ValueError('Pred_slot and tokens must be of the same length')

        for j, token in enumerate(tokens):
            output = f'{token}\t{slot_dict[pred_slot[j]]}'
            if slots is not None:
                output = f'{output}\t{slot_dict[slots[i][j]]}'
            logging.info(output)
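
A minimal usage sketch (not from the source): it assumes `get_vocab` from the surrounding module maps integer ids to label strings, and the vocabulary file names and toy logits below are placeholders covering a single two-token query.

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)

queries = ['play music']                      # one query with two tokens
intent_logits = np.array([[0.1, 2.3, -0.5]])  # shape (batch, num_intents)
slot_logits = np.zeros((1, 2, 4))             # shape (batch, seq_len, num_slots)
slot_masks = np.ones((1, 2))                  # 1.0 marks real (non-padding) tokens

read_intent_slot_outputs(queries, 'dict.intents.csv', 'dict.slots.csv',
                         intent_logits, slot_logits, slot_masks)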
Example #2
def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing at {outfold}.')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        with open(f'{infold}/atis.{mode}.query.csv', 'r') as f:
            queries = f.readlines()
        with open(f'{infold}/atis.{mode}.intent.csv', 'r') as f:
            intents = f.readlines()
        with open(f'{infold}/atis.{mode}.slots.csv', 'r') as f:
            slots = f.readlines()

        for i, query in enumerate(queries):
            # Drop the sentence-boundary (BOS/EOS) tokens before detokenizing.
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            if do_lower_case:
                sentence = sentence.lower()
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            # Slot labels for the boundary tokens are dropped the same way.
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()
    return outfold
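
A hedged usage sketch; the paths are hypothetical and the input folder is assumed to contain the Kaggle CSVs (atis.<mode>.query.csv, atis.<mode>.intent.csv, atis.<mode>.slots.csv, plus the atis.dict.*.csv vocabularies).

import logging

logging.basicConfig(level=logging.INFO)

# Hypothetical locations for the raw Kaggle download and the processed output.
processed_dir = process_atis('data/atis-raw', 'data/atis/nemo-processed',
                             do_lower_case=True)
print(f'Processed TSV files written to {processed_dir}')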
Example #3
    def __init__(self,
                 data_dir,
                 do_lower_case=False,
                 dataset_name='default',
                 none_slot_label='O',
                 pad_label=-1):
        if dataset_name == 'atis':
            self.data_dir = process_atis(data_dir, do_lower_case)
        elif dataset_name == 'snips-atis':
            self.data_dir, self.pad_label = self.merge(data_dir, [
                'ATIS/nemo-processed-uncased',
                'snips/nemo-processed-uncased/all'
            ], dataset_name)
        elif dataset_name == 'dialogflow':
            self.data_dir = process_dialogflow(data_dir, do_lower_case)
        elif dataset_name == 'mturk-processed':
            self.data_dir = process_mturk(data_dir, do_lower_case)
        elif dataset_name in {'snips-light', 'snips-speak', 'snips-all'}:
            self.data_dir = process_snips(data_dir, do_lower_case)
            if dataset_name.endswith('light'):
                self.data_dir = f'{self.data_dir}/light'
            elif dataset_name.endswith('speak'):
                self.data_dir = f'{self.data_dir}/speak'
            elif dataset_name.endswith('all'):
                self.data_dir = f'{self.data_dir}/all'
        elif dataset_name.startswith('jarvis'):
            self.data_dir = process_jarvis_datasets(
                data_dir,
                do_lower_case,
                dataset_name,
                modes=["train", "test", "eval"],
                ignore_prev_intent=False)
        else:
            if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
                raise FileNotFoundError(
                    "Make sure that your data follows the standard format "
                    "supported by JointIntentSlotDataset. Your data must "
                    "contain dict.intents.csv and dict.slots.csv.")
            self.data_dir = data_dir

        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'
        self.num_intents = len(get_vocab(self.intent_dict_file))
        slots = label2idx(self.slot_dict_file)
        self.num_slots = len(slots)

        for mode in ['train', 'test', 'eval']:

            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode '
                             f'is skipped as {mode}.tsv was not found.')
                continue

            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular intents in the {mode} set')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots in the {mode} set')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in slots:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = slots[none_slot_label]
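
This constructor appears to belong to a data-descriptor class; Example #4 refers to it as `JointIntentSlotDataDesc`, so that name is assumed in this hypothetical instantiation.

# Assumes data_dir holds (or can be processed into) dict.intents.csv,
# dict.slots.csv, <mode>.tsv and <mode>_slots.tsv in the standard layout.
data_desc = JointIntentSlotDataDesc(data_dir='data/atis',
                                    do_lower_case=True,
                                    dataset_name='atis')
print(data_desc.num_intents, data_desc.num_slots, data_desc.pad_label)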
Example #4
    def merge(self, data_dir, subdirs, dataset_name, modes=['train', 'test']):
        outfold = f'{data_dir}/{dataset_name}'
        if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold))
            slots = get_vocab(f'{outfold}/dict.slots.csv')
            # Recover the id of the 'O' (none) slot label from the merged vocab.
            none_slot = 0
            for key in slots:
                if slots[key] == 'O':
                    none_slot = key
                    break
            return outfold, int(none_slot)

        os.makedirs(outfold, exist_ok=True)

        data_files, slot_files = {}, {}
        for mode in modes:
            data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w')
            data_files[mode].write('sentence\tlabel\n')
            slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        intents, slots = {}, {}
        intent_shift, slot_shift = 0, 0
        none_intent, none_slot = -1, -1

        for subdir in subdirs:
            curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv')
            curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv')

            for key in curr_intents:
                if intent_shift > 0 and curr_intents[key] == 'O':
                    continue
                if curr_intents[key] == 'O' and intent_shift == 0:
                    none_intent = int(key)
                intents[int(key) + intent_shift] = curr_intents[key]

            for key in curr_slots:
                if slot_shift > 0 and curr_slots[key] == 'O':
                    continue
                if slot_shift == 0 and curr_slots[key] == 'O':
                    none_slot = int(key)
                slots[int(key) + slot_shift] = curr_slots[key]

            for mode in modes:
                with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f:
                    for line in f.readlines()[1:]:
                        text, label = line.strip().split('\t')
                        label = int(label)
                        if curr_intents[label] == 'O':
                            label = none_intent
                        else:
                            label = label + intent_shift
                        data_files[mode].write(f'{text}\t{label}\n')

                with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f:
                    for line in f.readlines():
                        labels = [int(label) for label in line.strip().split()]
                        shifted_labels = []
                        for label in labels:
                            if curr_slots[label] == 'O':
                                shifted_labels.append(none_slot)
                            else:
                                shifted_labels.append(label + slot_shift)
                        slot_files[mode].write(list2str(shifted_labels) + '\n')

            intent_shift += len(curr_intents)
            slot_shift += len(curr_slots)

        for mode in modes:
            data_files[mode].close()
            slot_files[mode].close()

        write_vocab_in_order(intents, f'{outfold}/dict.intents.csv')
        write_vocab_in_order(slots, f'{outfold}/dict.slots.csv')
        return outfold, none_slot

    def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'

        self.num_intents = len(get_vocab(self.intent_dict_file))
        slots = JointIntentSlotDataDesc.label2idx(self.slot_dict_file)
        self.num_slots = len(slots)

        for mode in ['train', 'test', 'dev']:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode '
                             f'is skipped as {mode}.tsv was not found.')
                continue

            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular intents in the {mode} set')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots in the {mode} set')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in slots:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = slots[none_slot_label]
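
A closing sketch for this pair: `merge` writes the combined SNIPS-ATIS folder with shifted label ids and returns it together with the id of the 'O' slot, while the simpler constructor reads an already-processed directory. The path and class name here are assumptions.

# Hypothetical preprocessed directory containing dict.intents.csv,
# dict.slots.csv, <mode>.tsv and <mode>_slots.tsv files.
data_desc = JointIntentSlotDataDesc('data/atis/nemo-processed',
                                    none_slot_label='O')
print(f'{data_desc.num_intents} intents, {data_desc.num_slots} slots, '
      f'pad label id {data_desc.pad_label}')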