Example #1
def process_dialogflow(infold, outfold, dev_split=0.1):
    if not os.path.exists(infold):
        link = 'www.dialogflow.com'
        raise ValueError(f'Data not found at {infold}. '
                         f'Export your dialogflow data from '
                         f'{link} and unzip at {infold}.')

    if if_exist(outfold, [f'{mode}.tsv' for mode in ['train', 'test']]):
        logging.info(DATABASE_EXISTS_TMP.format('dialogflow', outfold))
        return

    os.makedirs(outfold, exist_ok=True)

    files = get_intent_query_files_dialogflow(infold)
    slot_labels = get_slots_dialogflow(files)
    intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(
        files, slot_labels)
    train_queries, train_slots, test_queries, test_slots = partition_data(
        intent_queries, slot_tags, split=dev_split)

    write_files(train_queries, f'{outfold}/train.tsv')
    write_files(train_slots, f'{outfold}/train_slots.tsv')

    write_files(test_queries, f'{outfold}/test.tsv')
    write_files(test_slots, f'{outfold}/test_slots.tsv')

    write_files(slot_labels, f'{outfold}/dict.slots.csv')
    write_files(intent_names, f'{outfold}/dict.intents.csv')
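
A minimal usage sketch for the function above; the paths are hypothetical placeholders, and process_dialogflow is assumed to be importable together with the helpers it calls (if_exist, write_files, partition_data, ...).

# Hypothetical usage sketch; the paths below are placeholders.
process_dialogflow(
    infold='data/dialogflow_export',    # unzipped Dialogflow agent export
    outfold='data/dialogflow_nemo',     # destination for the generated TSV/CSV files
    dev_split=0.1,                      # fraction of queries held out for the test files
)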
Example #2
def process_assistant(infold, outfold, modes=['train', 'test']):
    """
    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset contains
    about 25 thousand examples covering 66 multi-domain intents and 57 entity types.
    """
    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(
        f'Processing assistant commands dataset and storing it at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # copy the train/test files to a convenient directory to work with
    copy_input_files(infold)
    infold += "/dataset"

    # get the list of intents from the train folder (the test folder is assumed to contain the same set)
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # get all train and test queries with their intent
    for mode in modes:
        intent_queries = get_intent_queries(infold, intent_names, mode)
        write_files(intent_queries, f'{outfold}/{mode}.tsv')

    # get list of all unique slots in training and testing files
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # create files of slot queries
    slot_dict = {k: v for v, k in enumerate(slot_types)}
    for mode in modes:
        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
Example #3
    def create_vocab_mlm(
        self,
        data_dir,
        vocab_size,
        sample_size,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        train_file='',
    ):
        vocab = special_tokens[:]
        bert_dir = f'{data_dir}/bert'
        if if_exist(bert_dir, ['tokenizer.model']):
            logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir))
            return data_dir, f'{bert_dir}/tokenizer.model'
        logging.info(f'Processing WikiText dataset and storing it at {bert_dir}')
        os.makedirs(bert_dir, exist_ok=True)

        if not train_file:
            files = glob.glob(f'{data_dir}/*.txt')
            train_file = f'{bert_dir}/merged.txt'
            logging.info(f"Merging {len(files)} txt files into {train_file}")

            with open(train_file, "w") as merged:
                for file in tqdm(files):
                    with open(file, 'r') as inf:
                        content = inf.read().strip()
                    merged.write(content + '\n\n\n')
        else:
            train_file = f'{data_dir}/{train_file}'

        cmd = (f"--input={train_file} --model_prefix={bert_dir}/tokenizer "
               f"--vocab_size={vocab_size - len(vocab)} "
               f"--input_sentence_size={sample_size} "
               f"--shuffle_input_sentence=true --hard_vocab_limit=false "
               f"--bos_id=-1 --eos_id=-1")

        SPT.Train(cmd)

        # Add BERT control symbols
        tokens = []

        with open(f"{bert_dir}/tokenizer.vocab", "r") as f:
            f.readline()  # skip first <unk> token

            # Read tokens from each line and parse for vocab
            for line in f:
                piece = line.split("\t")[0]
                token = piece[1:] if piece.startswith("▁") else f"##{piece}"
                tokens.append(token)

        vocab.extend(tokens)

        # Save vocabulary to output file
        with open(f'{bert_dir}/vocab.txt', "w") as f:
            for token in vocab:
                f.write(f"{token}\n".format())
        return data_dir, f'{bert_dir}/tokenizer.model'
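
For context, SPT above is presumably sentencepiece.SentencePieceTrainer; the snippet below is a sketch, assuming the sentencepiece and Hugging Face transformers packages, showing that likely binding and one possible way to consume the generated vocab.txt. Neither import is part of the original module.

# Assumed binding for SPT used above (an assumption, not shown in the snippet):
import sentencepiece as spm
SPT = spm.SentencePieceTrainer

# One possible consumer of the produced vocab.txt, purely illustrative:
from transformers import BertTokenizer
tokenizer = BertTokenizer(vocab_file='path/to/bert/vocab.txt', do_lower_case=False)
print(tokenizer.tokenize('wordpiece-style tokens built from a sentencepiece vocabulary'))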
Example #4
def process_thucnews(data_dir):
    modes = ['train', 'test']
    train_size = 0.8
    if not os.path.exists(data_dir):
        link = 'thuctc.thunlp.org/'
        raise ValueError(f'Data not found at {data_dir}. '
                         f'Please download THUCNews from {link}.')

    outfold = f'{data_dir}/nemo-processed-thucnews'

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold))
        return outfold
    logging.info(f'Processing THUCNews dataset and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}

    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'),
                              'a+',
                              encoding='utf-8')
        outfiles[mode].write('sentence\tlabel\n')
    categories = [
        '体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技',
        '股票', '财经'
    ]
    for label, category in enumerate(categories):
        category_files = glob.glob(f'{data_dir}/{category}/*.txt')
        test_num = int(len(category_files) * (1 - train_size))
        test_files = category_files[:test_num]
        train_files = category_files[test_num:]

        for mode in modes:
            logging.info(f'Processing {mode} data of the category {category}')
            if mode == 'test':
                files = test_files
            else:
                files = train_files

            if len(files) == 0:
                logging.info(f'Skipping category {category} for {mode} mode')
                continue

            for file in tqdm.tqdm(files):
                with open(file, 'r', encoding='utf-8') as f:
                    news = f.read().strip().replace('\r', '')
                    news = news.replace('\n', '').replace('\t', ' ')
                    outfiles[mode].write(f'{news}\t{label}\n')
    for mode in modes:
        outfiles[mode].close()

    return outfold
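
Each {mode}.tsv produced above stores one article per row as tab-separated text and numeric label; below is a small sketch of reading the output back (pandas and the path are assumptions here, any TSV reader works).

# Hypothetical sanity check of the processed output; pandas and the path are assumptions.
import pandas as pd

df = pd.read_csv('THUCNews/nemo-processed-thucnews/train.tsv',
                 sep='\t', header=0, quoting=3)  # quoting=3 (QUOTE_NONE) keeps the raw text intact
print(df['label'].value_counts())                # rough class balance across the 14 categories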
Example #5
def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1):
    if not os.path.exists(data_dir):
        link = ('www.github.com/snipsco/spoken-language'
                '-understanding-research-datasets')
        raise ValueError(
            f'Data not found at {data_dir}. '
            f'Please download the SNIPS dataset from {link}.')

    outfold = f'{data_dir}/nemo-processed'

    if uncased:
        outfold = f'{outfold}-uncased'

    exist = True
    for dataset in ['light', 'speak', 'all']:
        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
            logging.info(
                DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(),
                                           outfold))
        else:
            exist = False
    if exist:
        return outfold

    logging.info(f'Processing SNIPS dataset and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    speak_dir = 'smart-speaker-en-close-field'
    light_dir = 'smart-lights-en-close-field'

    light_files = [f'{data_dir}/{light_dir}/dataset.json']
    speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json']
    speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json')

    light_train, light_dev, light_slots, light_intents = get_dataset(
        light_files, dev_split)
    speak_train, speak_dev, speak_slots, speak_intents = get_dataset(
        speak_files)

    create_dataset(light_train, light_dev, light_slots, light_intents, uncased,
                   f'{outfold}/light')
    create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased,
                   f'{outfold}/speak')
    create_dataset(
        light_train + speak_train,
        light_dev + speak_dev,
        light_slots | speak_slots,
        light_intents | speak_intents,
        uncased,
        f'{outfold}/all',
    )

    return outfold
Example #6
def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0):
    """ MSFT's dataset, processed by Kaggle
    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
    """
    outfold = f'{infold}/nemo-processed'
    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')

    if uncased:
        outfold = f'{outfold}-uncased'

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
        return outfold
    logging.info(f'Processing ATIS dataset and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}

    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines()
        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines()
        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines()

        for i, query in enumerate(queries):
            sentence = ids2text(query.strip().split()[1:-1], vocab)
            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
            slot = ' '.join(slots[i].strip().split()[1:-1])
            outfiles[mode + '_slots'].write(slot + '\n')

    shutil.copyfile(f'{infold}/atis.dict.intent.csv',
                    f'{outfold}/dict.intents.csv')
    shutil.copyfile(f'{infold}/atis.dict.slots.csv',
                    f'{outfold}/dict.slots.csv')
    for mode in modes:
        outfiles[mode].close()
        outfiles[mode + '_slots'].close()

    return outfold
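
The {mode}_slots.tsv files written above contain one space-separated sequence of slot IDs per query, aligned token-for-token with the sentence column of {mode}.tsv; below is a hedged sanity check of that alignment (the paths are placeholders).

# Hypothetical alignment check on the processed ATIS output; paths are placeholders.
with open('atis/nemo-processed/train.tsv') as sents, \
        open('atis/nemo-processed/train_slots.tsv') as slots:
    next(sents)  # skip the 'sentence\tlabel' header line
    for sent_line, slot_line in zip(sents, slots):
        sentence = sent_line.split('\t')[0]
        assert len(sentence.split()) == len(slot_line.split()), 'token/slot length mismatch'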
Example #7
def process_imdb(data_dir, uncased, modes=['train', 'test']):
    if not os.path.exists(data_dir):
        link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset'
        raise ValueError(f'Data not found at {data_dir}. '
                         f'Please download IMDB from {link}.')

    outfold = f'{data_dir}/nemo-processed'

    if uncased:
        outfold = f'{outfold}_uncased'

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold))
        return outfold
    logging.info(f'Processing IMDB dataset and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}

    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        for sent in ['neg', 'pos']:
            label = 0 if sent == 'neg' else 1
            files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt')
            for file in files:
                with open(file, 'r') as f:
                    review = f.read().strip()
                if uncased:
                    review = review.lower()
                review = review.replace("<br />", "")
                outfiles[mode].write(f'{review}\t{label}\n')
    for mode in modes:
        outfiles[mode].close()

    return outfold
Example #8
def process_mturk(infold, outfold, modes=['train', 'test']):
    if not os.path.exists(infold):
        link = 'www.mturk.com'
        raise ValueError(
            f'Data not found at {infold}. ' f'Export your mturk data from {link} and unzip it at {infold}.'
        )

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold))
        return

    logging.info(f'Processing dataset from mturk and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    classification_data_file = f'{infold}/classification.csv'
    annotation_data_file = f'{infold}/annotation.manifest'

    if not os.path.exists(classification_data_file):
        raise FileNotFoundError(f'File not found at {classification_data_file}')

    if not os.path.exists(annotation_data_file):
        raise FileNotFoundError(f'File not found at {annotation_data_file}')

    utterances = read_csv(classification_data_file)

    # This function assumes that the intent classification data has been
    # reviewed and cleaned and only one label per utterance is present.
    agreed_all, intent_names = get_intents_mturk(utterances, outfold)

    with open(annotation_data_file, 'r') as f:
        slot_annotations = f.readlines()

    # The preprocessing step is assumed to have made
    # the task_name of all the annotations generic
    task_name = 'retail-combined'

    # It is assumed that every utterance has corresponding
    # slot annotation information
    if len(slot_annotations) < len(agreed_all):
        raise ValueError('Every utterance must have corresponding slot annotation information')

    slot_labels, intent_queries, slot_tags = process_intent_slot_mturk(
        slot_annotations, agreed_all, intent_names, task_name
    )

    assert len(slot_tags) == len(intent_queries)

    dev_split = 0.1

    train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split)

    write_files(train_queries, f'{outfold}/train.tsv')
    write_files(train_slots, f'{outfold}/train_slots.tsv')

    write_files(test_queries, f'{outfold}/test.tsv')
    write_files(test_slots, f'{outfold}/test_slots.tsv')

    write_files(slot_labels, f'{outfold}/dict.slots.csv')
    write_files(intent_names, f'{outfold}/dict.intents.csv')
Example #9
def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False):
    """ process and convert Jarvis datasets into NeMo's BIO format
    """
    outfold = f'{infold}/{dataset_name}-nemo-processed'
    infold = f'{infold}/'

    if uncased:
        outfold = f'{outfold}-uncased'

    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w')

        queries = open(f'{infold}/{mode}.tsv', 'r').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t{intents_list[intent_str]}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    [start_i, end_i, slot_name] = st.strip().split(":")
                    slot_tags_list.append([int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip().split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
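
To make the BIO output format concrete: dict.slots.csv holds one label per line ('O' first, then B-/I- pairs in discovery order), and each line of {mode}_slots.tsv is a space-separated list of indices into that file, one index per token of the cleaned sentence. The decoding sketch below uses made-up labels and indices for illustration only.

# Illustrative decoding of one line of {mode}_slots.tsv back into BIO labels.
# The label list and the example line are invented for illustration.
slot_labels = ['O', 'B-time', 'I-time', 'B-location', 'I-location']  # as read from dict.slots.csv
slot_line = '0 0 1 2 0 3'                                            # one line of train_slots.tsv
print([slot_labels[int(i)] for i in slot_line.split()])
# -> ['O', 'O', 'B-time', 'I-time', 'O', 'B-location']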
Example #10
def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'):
    """ Dataset has to be of:
    - ubuntu
    - chat
    - web
    """

    if not os.path.exists(filename):
        link = 'https://github.com/sebischair/NLU-Evaluation-Corpora'
        raise ValueError(f'Data not found at {filename}. ' f'Please download the NLU Evaluation Corpora from {link}.')

    if dataset_name == 'nlu-ubuntu':
        INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0}
    elif dataset_name == 'nlu-chat':
        INTENT = {'departuretime': 0, 'findconnection': 1}
    elif dataset_name == 'nlu-web':
        INTENT = {
            'changepassword': 1,
            'deleteaccount': 2,
            'downloadvideo': 3,
            'exportdata': 4,
            'filterspam': 5,
            'findalternative': 6,
            'syncaccounts': 7,
            'none': 0,
        }
    else:
        raise ValueError(f'{dataset_name}: Invalid dataset name')

    infold = filename[: filename.rfind('/')]
    outfold = f'{infold}/{dataset_name}-nemo-processed'

    if uncased:
        outfold = f'{outfold}_uncased'

    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold))
        return outfold
    logging.info(f'Processing data and storing it at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}

    for mode in modes:
        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w')
        outfiles[mode].write('sentence\tlabel\n')

    with open(filename, 'r') as f:
        data = json.load(f)

    for obj in data['sentences']:
        sentence = obj['text'].strip()
        if uncased:
            sentence = sentence.lower()
        intent = obj['intent'].lower().replace(' ', '')
        label = INTENT[intent]
        txt = f'{sentence}\t{label}\n'
        if obj['training']:
            outfiles['train'].write(txt)
        else:
            outfiles['test'].write(txt)
    for mode in modes:
        outfiles[mode].close()
    return outfold
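
A minimal call sketch for the chat corpus; the JSON path is a placeholder, and process_nlu is assumed to be importable together with its helpers (if_exist, DATABASE_EXISTS_TMP, ...).

# Hypothetical invocation; the path is a placeholder.
outfold = process_nlu(
    filename='data/nlu/ChatbotCorpus.json',  # downloaded NLU Evaluation Corpora file
    uncased=True,
    modes=['train', 'test'],
    dataset_name='nlu-chat',                 # selects the departuretime/findconnection label map
)
print(f'Processed TSV files written to {outfold}')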