Example #1
        def get_stats_and_save(all_labels, label_ids, name):
            # NOTE: 'text_file' is not a parameter; this helper is a fragment
            # that captures it from its enclosing scope.
            infold = text_file[:text_file.rfind('/')]
            merged_labels = itertools.chain.from_iterable(all_labels)
            logging.info('Three most popular labels')
            _, label_frequencies = get_label_stats(
                merged_labels, infold + '/label_count_' + name + '.tsv')

            out_path = os.path.join(infold, name + '_label_ids.csv')
            labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1]))
            with open(out_path, 'w') as out:
                out.write('\n'.join(labels))
            logging.info(f'Labels: {label_ids}')
            logging.info(f'Labels mapping saved to: {out_path}')

            return label_frequencies
    def __init__(self, data_dir, modes=['train', 'test', 'dev']):
        self.data_dir = data_dir

        max_label_id = 0
        class_weights_dict = {}  # initialized so the lookup below cannot fail if train.tsv is absent
        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(
                    f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.'
                )
                continue

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            try:
                int(input_lines[0].strip().split()[-1])
            except ValueError:
                logging.warning(f'No numerical labels found for {mode}.tsv.')
                raise

            queries, raw_sentences = [], []
            for input_line in input_lines:
                parts = input_line.strip().split()
                label = int(parts[-1])
                raw_sentences.append(label)
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular classes in {mode} dataset')
            total_sents, sent_label_freq, max_id = get_label_stats(
                raw_sentences, infold + f'/{mode}_sentence_stats.tsv')
            max_label_id = max(max_label_id, max_id)

            if mode == 'train':
                class_weights_dict = get_freq_weights(sent_label_freq)
                logging.info(f'Class Weights: {class_weights_dict}')

            logging.info(f'Total Sentences: {total_sents}')
            logging.info(f'Sentence class frequencies - {sent_label_freq}')

        self.class_weights = fill_class_weights(class_weights_dict,
                                                max_label_id)

        self.num_labels = max_label_id + 1
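
The descriptor above leans on several helpers that are not shown: if_exist, get_label_stats, get_freq_weights, and fill_class_weights. Below is a minimal sketch of what they might look like, inferred only from how they are called here; the exact signatures and the weighting formula are assumptions, not the original implementations. Note that one call site above unpacks three values from get_label_stats while others unpack two, so the real helper likely differs between versions.

    import os
    from collections import Counter

    def if_exist(data_dir, files):
        # True only if every listed file is present under data_dir.
        return all(os.path.exists(os.path.join(data_dir, f)) for f in files)

    def get_label_stats(labels, outfile):
        # Count label occurrences, write "label<TAB>count" rows sorted by
        # frequency, and return the total count plus the frequency map.
        counter = Counter(labels)
        total = sum(counter.values())
        with open(outfile, 'w') as f:
            for label, count in counter.most_common():
                f.write(f'{label}\t{count}\n')
        return total, dict(counter)

    def get_freq_weights(label_freq):
        # Inverse-frequency weights: rarer classes get larger weights.
        total = sum(label_freq.values())
        return {label: total / (len(label_freq) * freq)
                for label, freq in label_freq.items()}

    def fill_class_weights(weights, max_id):
        # Expand a sparse {label_id: weight} dict into a dense list of
        # length max_id + 1, defaulting missing ids to 1.0.
        return [weights.get(i, 1.0) for i in range(max_id + 1)]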
Example #3
    def __init__(
        self,
        text_file,
        label_file,
        max_seq_length,
        tokenizer,
        num_samples=-1,
        shuffle=False,
        pad_label='O',
        label_ids=None,
        ignore_extra_tokens=False,
        ignore_start_end=False,
        use_cache=False,
    ):

        if use_cache:
            # Cache features
            data_dir = os.path.dirname(text_file)
            filename = os.path.basename(text_file)

            if not filename.endswith('.txt'):
                raise ValueError("{text_file} should have extension .txt")

            features_pkl = os.path.join(data_dir, filename[:-4] + "_features.pkl")
            label_ids_pkl = os.path.join(data_dir, "label_ids.pkl")

        if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl):
            # If text_file was already processed, load from pickle
            with open(features_pkl, 'rb') as f:
                features = pickle.load(f)
            logging.info(f'features restored from {features_pkl}')

            with open(label_ids_pkl, 'rb') as f:
                label_ids = pickle.load(f)
            logging.info(f'Labels to ids dict restored from {label_ids_pkl}')
        else:
            if num_samples == 0:
                raise ValueError(f'num_samples has to be positive, got {num_samples}')

            with open(text_file, 'r') as f:
                text_lines = f.readlines()

            # Collect all possible labels
            unique_labels = set()
            labels_lines = []
            with open(label_file, 'r') as f:
                for line in f:
                    line = line.strip().split()
                    labels_lines.append(line)
                    unique_labels.update(line)

            if len(labels_lines) != len(text_lines):
                raise ValueError("Labels file should contain labels for every word")

            if shuffle or num_samples > 0:
                dataset = list(zip(text_lines, labels_lines))
                # Shuffle even when only sampling so that num_samples picks
                # a random subset rather than the first lines of the file.
                random.shuffle(dataset)

                if num_samples > 0:
                    dataset = dataset[:num_samples]

                dataset = list(zip(*dataset))
                text_lines = dataset[0]
                labels_lines = dataset[1]

            # for dev/test sets use label mapping from training set
            if label_ids:
                if len(label_ids) != len(unique_labels):
                    logging.warning(
                        'Not all labels from the specified label_ids '
                        'dictionary are present in the current dataset. '
                        'Using the provided label_ids dictionary.'
                    )
                else:
                    logging.info('Using the provided label_ids dictionary.')
            else:
                logging.info(
                    'Creating a new label to label_id dictionary. It is '
                    'recommended to use the label_ids generated during '
                    'training for dev/test sets, to avoid errors if some '
                    'labels are not present in the dev/test sets. For the '
                    'training set, label_ids should be None.'
                )

                label_ids = {pad_label: 0}
                if pad_label in unique_labels:
                    unique_labels.remove(pad_label)
                for label in sorted(unique_labels):
                    label_ids[label] = len(label_ids)

            features = get_features(
                text_lines,
                max_seq_length,
                tokenizer,
                pad_label=pad_label,
                raw_labels=labels_lines,
                label_ids=label_ids,
                ignore_extra_tokens=ignore_extra_tokens,
                ignore_start_end=ignore_start_end,
            )

            if use_cache:
                with open(features_pkl, 'wb') as f:
                    pickle.dump(features, f)
                logging.info(f'features saved to {features_pkl}')

                with open(label_ids_pkl, 'wb') as f:
                    pickle.dump(label_ids, f)
                logging.info(f'labels to ids dict saved to {label_ids_pkl}')

        self.all_input_ids = features[0]
        self.all_segment_ids = features[1]
        self.all_input_mask = features[2]
        self.all_loss_mask = features[3]
        self.all_subtokens_mask = features[4]
        self.all_labels = features[5]
        self.label_ids = label_ids

        infold = text_file[: text_file.rfind('/')]
        merged_labels = itertools.chain.from_iterable(self.all_labels)
        logging.info('Three most popular labels')
        _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv')

        # save label_ids
        out_path = infold + '/label_ids.csv'
        labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1]))
        with open(out_path, 'w') as out:
            out.write('\n'.join(labels))
        logging.info(f'Labels: {self.label_ids}')
        logging.info(f'Labels mapping saved to: {out_path}')
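
For orientation, here is a minimal usage sketch of the constructor above. The class name (BertTokenClassificationDataset), the file paths, and the tokenizer are all assumptions: none of them appear in the snippet itself.

    # Hypothetical usage; paths and tokenizer are placeholders.
    tokenizer = ...  # any tokenizer object accepted by get_features()
    dataset = BertTokenClassificationDataset(
        text_file='data/text_train.txt',     # one whitespace-tokenized sentence per line
        label_file='data/labels_train.txt',  # one label per word, aligned line by line
        max_seq_length=128,
        tokenizer=tokenizer,
        pad_label='O',
        use_cache=True,  # pickles features next to text_file on first use
    )
    train_label_ids = dataset.label_ids  # reuse for dev/test datasets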
    def __init__(self, data_dir, none_slot_label='O', pad_label=-1):
        if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
            raise FileNotFoundError(
                "Make sure that your data follows the standard format "
                "supported by JointIntentSlotDataset. Your data must "
                "contain dict.intents.csv and dict.slots.csv.")

        self.data_dir = data_dir
        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'

        self.intents_label_ids = JointIntentSlotDataDesc.label2idx(
            self.intent_dict_file)
        self.num_intents = len(self.intents_label_ids)
        self.slots_label_ids = JointIntentSlotDataDesc.label2idx(
            self.slot_dict_file)
        self.num_slots = len(self.slots_label_ids)

        for mode in ['train', 'test', 'dev']:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode'
                             f' is skipped as {mode}.tsv was not found.')
                continue
            logging.info(f'Calculating stats for {mode} mode...')
            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between the slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = self.data_dir

            logging.info(f'Three most popular intents in {mode} mode:')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots in {mode} mode:')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in self.slots_label_ids:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = self.slots_label_ids[none_slot_label]
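
JointIntentSlotDataDesc.label2idx is called but not shown (Example #5 below also calls a bare label2idx). Based on how its result is used, a plausible minimal version would map each label to its line index; this is a guess, not the original implementation:

    def label2idx(file):
        # One label per line; the line index becomes the label id.
        with open(file, 'r') as f:
            return {line.strip(): i for i, line in enumerate(f)}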
Example #5
    def __init__(self,
                 data_dir,
                 do_lower_case=False,
                 dataset_name='default',
                 none_slot_label='O',
                 pad_label=-1):
        if dataset_name == 'atis':
            self.data_dir = process_atis(data_dir, do_lower_case)
        elif dataset_name == 'snips-atis':
            self.data_dir, self.pad_label = self.merge(data_dir, [
                'ATIS/nemo-processed-uncased',
                'snips/nemo-processed-uncased/all'
            ], dataset_name)
        elif dataset_name == 'dialogflow':
            self.data_dir = process_dialogflow(data_dir, do_lower_case)
        elif dataset_name == 'mturk-processed':
            self.data_dir = process_mturk(data_dir, do_lower_case)
        elif dataset_name in {'snips-light', 'snips-speak', 'snips-all'}:
            self.data_dir = process_snips(data_dir, do_lower_case)
            if dataset_name.endswith('light'):
                self.data_dir = f'{self.data_dir}/light'
            elif dataset_name.endswith('speak'):
                self.data_dir = f'{self.data_dir}/speak'
            elif dataset_name.endswith('all'):
                self.data_dir = f'{self.data_dir}/all'
        elif dataset_name.startswith('jarvis'):
            self.data_dir = process_jarvis_datasets(
                data_dir,
                do_lower_case,
                dataset_name,
                modes=["train", "test", "eval"],
                ignore_prev_intent=False)
        else:
            if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']):
                raise FileNotFoundError(
                    "Make sure that your data follows the standard format "
                    "supported by JointIntentSlotDataset. Your data must "
                    "contain dict.intents.csv and dict.slots.csv.")
            self.data_dir = data_dir

        self.intent_dict_file = self.data_dir + '/dict.intents.csv'
        self.slot_dict_file = self.data_dir + '/dict.slots.csv'
        self.num_intents = len(get_vocab(self.intent_dict_file))
        slots = label2idx(self.slot_dict_file)
        self.num_slots = len(slots)

        for mode in ['train', 'test', 'eval']:

            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode'
                             f' is skipped as {mode}.tsv was not found.')
                continue

            slot_file = f'{self.data_dir}/{mode}_slots.tsv'
            with open(slot_file, 'r') as f:
                slot_lines = f.readlines()

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            if len(slot_lines) != len(input_lines):
                raise ValueError(
                    "Make sure that the number of slot lines matches the "
                    "number of intent lines. There should be a one-to-one "
                    "correspondence between the slot and intent lines.")

            dataset = list(zip(slot_lines, input_lines))

            raw_slots, queries, raw_intents = [], [], []
            for slot_line, input_line in dataset:
                slot_list = [int(slot) for slot in slot_line.strip().split()]
                raw_slots.append(slot_list)
                parts = input_line.strip().split()
                raw_intents.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[:input_file.rfind('/')]

            logging.info(f'Three most popular intents in {mode} mode')
            total_intents, intent_label_freq = get_label_stats(
                raw_intents, infold + f'/{mode}_intent_stats.tsv')
            merged_slots = itertools.chain.from_iterable(raw_slots)

            logging.info(f'Three most popular slots in {mode} mode')
            slots_total, slots_label_freq = get_label_stats(
                merged_slots, infold + f'/{mode}_slot_stats.tsv')

            if mode == 'train':
                self.slot_weights = calc_class_weights(slots_label_freq)
                logging.info(f'Slot weights are - {self.slot_weights}')

                self.intent_weights = calc_class_weights(intent_label_freq)
                logging.info(f'Intent weights are - {self.intent_weights}')

            logging.info(f'Total intents - {total_intents}')
            logging.info(f'Intent label frequency - {intent_label_freq}')
            logging.info(f'Total Slots - {slots_total}')
            logging.info(f'Slots label frequency - {slots_label_freq}')

        if pad_label != -1:
            self.pad_label = pad_label
        else:
            if none_slot_label not in slots:
                raise ValueError(f'none_slot_label {none_slot_label} not '
                                 f'found in {self.slot_dict_file}.')
            self.pad_label = slots[none_slot_label]
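
Both intent-slot descriptors compute class weights from the frequency maps via calc_class_weights, which is not shown. A minimal sketch consistent with the call sites follows; inverse-frequency weighting is an assumption, and the original helper may normalize differently.

    def calc_class_weights(label_freq):
        # label_freq: {label_id: count}, as produced by get_label_stats.
        total = sum(label_freq.values())
        weights = [1.0] * (max(label_freq) + 1)
        for label, freq in label_freq.items():
            weights[label] = total / (len(label_freq) * freq)
        return weights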
    def __init__(self, dataset_name, data_dir, do_lower_case, modes=['train', 'test', 'eval']):
        if dataset_name == 'sst-2':
            self.data_dir = process_sst_2(data_dir)
            self.num_labels = 2
            self.eval_file = self.data_dir + '/dev.tsv'
        elif dataset_name == 'imdb':
            self.num_labels = 2
            self.data_dir = process_imdb(data_dir, do_lower_case)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name == 'thucnews':
            self.num_labels = 14
            self.data_dir = process_thucnews(data_dir)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name.startswith('nlu-'):
            if dataset_name.endswith('chat'):
                data_dir = f'{data_dir}/ChatbotCorpus.json'
                self.num_labels = 2
            elif dataset_name.endswith('ubuntu'):
                data_dir = f'{data_dir}/AskUbuntuCorpus.json'
                self.num_labels = 5
            elif dataset_name.endswith('web'):
                data_dir = f'{data_dir}/WebApplicationsCorpus.json'
                self.num_labels = 8
            self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name)
            self.eval_file = self.data_dir + '/test.tsv'
        elif dataset_name.startswith('jarvis'):
            self.data_dir = process_jarvis_datasets(
                data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False
            )

            intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv')
            self.num_labels = len(intents)
        elif dataset_name != 'default_format':
            raise ValueError(
                "Looks like you passed a dataset name that isn't "
                "already supported by NeMo. Please make sure "
                "that you build the preprocessing method for it. "
                "default_format assumes that a data file has a header and each line of the file follows "
                "the format: text [TAB] label. Label is assumed to be an integer."
            )
        else:
            # 'default_format': the provided directory is used as-is and must
            # already contain the mode .tsv files.
            self.data_dir = data_dir

        self.train_file = self.data_dir + '/train.tsv'

        for mode in modes:
            if not if_exist(self.data_dir, [f'{mode}.tsv']):
                logging.info(f'Stats calculation for {mode} mode is skipped as {mode}.tsv was not found.')
                continue

            input_file = f'{self.data_dir}/{mode}.tsv'
            with open(input_file, 'r') as f:
                input_lines = f.readlines()[1:]  # Skipping headers at index 0

            try:
                int(input_lines[0].strip().split()[-1])
            except ValueError:
                logging.warning(f'No numerical labels found for {mode}.tsv in {dataset_name} dataset.')
                raise

            queries, raw_sentences = [], []
            for input_line in input_lines:
                parts = input_line.strip().split()
                raw_sentences.append(int(parts[-1]))
                queries.append(' '.join(parts[:-1]))

            infold = input_file[: input_file.rfind('/')]

            logging.info(f'Three most popular classes in {mode} dataset')
            total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv')

            if mode == 'train':
                self.class_weights = calc_class_weights(sent_label_freq)
                logging.info(f'Class weights are - {self.class_weights}')

            logging.info(f'Total Sentences - {total_sents}')
            logging.info(f'Sentence class frequencies - {sent_label_freq}')
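
A minimal usage sketch for the 'default_format' path closes this example. The class name (SentenceClassificationDataDesc) and the directory layout are assumptions, not shown in the code above.

    # Hypothetical: data/my_corpus holds train.tsv and test.tsv, each with a
    # header row followed by "text<TAB>integer-label" lines.
    data_desc = SentenceClassificationDataDesc(
        dataset_name='default_format',
        data_dir='data/my_corpus',
        do_lower_case=False,
        modes=['train', 'test'],
    )
    print(data_desc.train_file)     # data/my_corpus/train.tsv
    print(data_desc.class_weights)  # computed from the train split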