Example #1
def create_datasets(path_to_data_folders):
    data_root_folder = Path(path_to_data_folders)
    ref_train_path = data_root_folder / 'train.csv'
    ref_train_dataset = read_csv(ref_train_path)
    for t in data_root_folder.iterdir():  # iterate over train size
        if not t.is_dir():  # ignore reference train
            continue
        train_size = int(t.name.split('_')[-1])
        LOGGER.info("Processing datasets of size: %s", train_size)
        for s in t.iterdir():  # iterate over seeds
            if not s.is_dir():  # ignore pkl
                continue
            LOGGER.info("Processing seed: %s" % str(s.name))
            # validate
            val_csv_data = read_csv(s / 'validate.csv')
            val_intents, val_entities = extract_intents_entities(
                val_csv_data, ENTITY_MAPPING)
            val_dataset = make_dataset_dict(val_intents, val_entities)
            dump_json(val_dataset, s / 'validate.json')

            # train
            tr_csv_data = read_csv(s / 'train_{}.csv'.format(train_size))
            train_intents, train_entities = extract_intents_entities(
                tr_csv_data, ENTITY_MAPPING)
            enriched_train_entities = merge_entity_dict(
                train_entities, val_entities)
            train_dataset = make_dataset_dict(train_intents,
                                              enriched_train_entities)
            dump_json(train_dataset, s / 'train_{}.json'.format(train_size))

            # augmented
            csv_data = read_csv(s / 'train_{}_aug_2000.csv'.format(train_size))
            augmented_utterances = csv_data[-2000:]

            process_and_dump_augmentation(
                current_path=s,
                train_data=tr_csv_data,
                train_entities=enriched_train_entities,
                augmentation_data=augmented_utterances,
                ref_data=ref_train_dataset,
                augmentation_ratio=0.5,
                train_size=train_size)

            process_and_dump_augmentation(
                current_path=s,
                train_data=tr_csv_data,
                train_entities=enriched_train_entities,
                augmentation_data=augmented_utterances,
                ref_data=ref_train_dataset,
                augmentation_ratio=1,
                train_size=train_size)
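For orientation, a minimal sketch of how create_datasets might be invoked. The folder and file names below are assumptions chosen to satisfy the paths the function builds, not something taken from the code above.

# Hypothetical layout (assumed for illustration):
#
# data/
#   train.csv                        # reference train set
#   train_size_128/
#     seed_0/
#       train_128.csv
#       validate.csv
#       train_128_aug_2000.csv
create_datasets('data')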
Example #2
    def add_nones(self,
                  sentences,
                  none_folder,
                  none_size=None,
                  none_intents=None,
                  pseudolabels=None,
                  none_idx=None):
        none_path = none_folder / 'train.csv'
        none_sentences = read_csv(none_path)

        if none_intents is not None:
            none_sentences = self.filter_intents(none_sentences, none_intents)

        random.shuffle(none_sentences)
        for row in none_sentences[:none_size]:
            if 'snips' in str(none_folder):
                # the None pool is already in the snips format, so reuse the
                # row and only override its intent column
                new_row = row
                if pseudolabels is not None:
                    new_row[3] = pseudolabels[row[3]]
                else:
                    new_row[3] = 'None'
            else:
                utterance = row[none_idx]
                new_row = [
                    utterance, 'O ' * len(word_tokenize(utterance)), utterance,
                    'None'
                ]
                if pseudolabels is not None:
                    new_row[3] = pseudolabels[row[3]]
            sentences.append(new_row)
        return sentences
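A hedged usage sketch: builder stands for an instance of the surrounding class, and the paths are hypothetical. Because the assumed None folder name contains 'snips', the rows are reused as-is and only their intent column (index 3) is overwritten with 'None'.

train_rows = read_csv(Path('data/snips/seed_0/train_128.csv'))
train_rows = builder.add_nones(train_rows,               # hypothetical instance
                               Path('data/none_snips'),  # assumed None pool
                               none_size=50,
                               none_idx=3)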
Example #3
def add_nones(sentences, none_folder, none_idx, none_size):
    none_path = none_folder / 'train.csv'
    none_sentences = read_csv(none_path)
    random.shuffle(none_sentences)
    for row in none_sentences[:none_size]:
        utterance = row[none_idx]
        new_row = ["", "", "", "None", "", utterance, "", "", ""]
        sentences.append(new_row)
    return sentences
Example #4
def save_augmented_dataset(generated_sentences, n_generated, train_path,
                           output_dir):
    dataset = read_csv(train_path)
    for s, l, d, i in zip(generated_sentences['utterances'],
                          generated_sentences['labellings'],
                          generated_sentences['delexicalised'],
                          generated_sentences['intents']):
        dataset.append([s, l, d, i])
    augmented_path = output_dir / Path(
        train_path.name.replace('.csv', '_aug_{}.csv'.format(n_generated)))
    write_csv(dataset, augmented_path)
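A small, hypothetical example of the expected input shape: generated_sentences mirrors the four CSV columns used throughout these snippets (utterance, labelling, delexicalised, intent); the label strings and paths are purely illustrative. The resulting file name matches what create_datasets in Example #1 reads back.

generated_sentences = {
    'utterances': ['play some jazz'],
    'labellings': ['O O B-genre'],
    'delexicalised': ['play some _genre_'],
    'intents': ['PlayMusic'],
}
save_augmented_dataset(generated_sentences,
                       n_generated=2000,
                       train_path=Path('data/snips/seed_0/train_128.csv'),
                       output_dir=Path('data/snips/seed_0'))
# -> writes data/snips/seed_0/train_128_aug_2000.csv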
Example #5
def csv2json(csv_path_in, output_dir):
    print('Starting csv2json conversion...')
    jsondic = {'language': 'en'}
    csv_data = read_csv(Path(csv_path_in))
    intents, entities = extract_intents_entities(csv_data)

    jsondic['intents'] = intents
    jsondic['entities'] = entities

    filename = str(Path(csv_path_in).stem) + '.json'
    path_out = Path(output_dir) / filename
    dump_json(jsondic, path_out)

    print('Successfully converted csv2json!')
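A one-line usage sketch with assumed paths; the JSON file is written into the output directory under the same stem as the input CSV.

csv2json('data/snips/seed_0/train_128.csv', 'data/snips/seed_0')
# -> data/snips/seed_0/train_128.json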
Example #6
def embed_dataset(dataset_path, infersent_path, force_cpu=False):
    """
    To make this work, first run ./get_infersent.sh
    """
    MODEL_PATH = infersent_path / "encoder/infersent1.pkl"
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 1}
    model = InferSent(params_model)
    if force_cpu:
        model.load_state_dict(torch.load(MODEL_PATH, map_location='cpu'))
    else:
        model.load_state_dict(torch.load(MODEL_PATH))
        model.cuda()

    W2V_PATH = infersent_path / 'GloVe/glove.840B.300d.txt'
    model.set_w2v_path(W2V_PATH)
    model.build_vocab_k_words(K=100000)

    csv_data = read_csv(dataset_path / 'train.csv')
    csv_data = csv_data[1:]  # skip header
    data = defaultdict(list)

    for row in csv_data:
        if 'snips' in str(dataset_path):
            utterance, labels, delexicalised, intent = row
        else:
            raise TypeError(
                "Unknown dataset type. Implement your own first. See the "
                "README")
        data[intent].append(utterance)

    vectors = {}
    for i, (intent, sentences) in enumerate(data.items()):
        print('{}/{} done'.format(i, len(data)))
        embeddings = model.encode(sentences)
        avg_embedding = np.mean(embeddings, axis=0)
        vectors[intent] = avg_embedding

    return vectors
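A hedged follow-up showing one way the returned per-intent vectors could be compared, here with a plain cosine similarity. The paths are assumptions, and the InferSent weights and GloVe vectors must already have been fetched (see the docstring above).

import numpy as np

# Assumed paths; adjust to where get_infersent.sh put the model files.
vectors = embed_dataset(Path('data/snips'), Path('infersent'), force_cpu=True)

def cosine(u, v):
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

for a in vectors:
    for b in vectors:
        if a < b:
            print(a, b, cosine(vectors[a], vectors[b]))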
Example #7
def add_nones(sentences, none_folder, none_idx, none_size):
    none_path = none_folder / 'train.csv'
    none_sentences = read_csv(none_path)
    random.shuffle(none_sentences)
    new_rows = [[row[none_idx]] for row in none_sentences[:none_size]]
    return sentences + new_rows
Example #8
    def build_data_files(self,
                         dataset_folder,
                         dataset_size=None,
                         restrict_intents=None,
                         none_folder=None,
                         none_size=None,
                         none_intents=None,
                         none_idx=None,
                         infersent_selection="no_infersent_selection",
                         cosine_threshold=None,
                         output_folder=None,
                         skip_header=True):

        original_train_path = dataset_folder / 'train.csv'
        original_test_path = dataset_folder / 'validate.csv'

        new_train = read_csv(original_train_path)
        new_test = read_csv(original_test_path)

        if skip_header:
            header_train = new_train[0]
            header_test = new_test[0]
            new_train = new_train[1:]
            new_test = new_test[1:]

        # filter intents
        filter_prefix = ''
        if restrict_intents is not None:
            filter_prefix = '_filtered'
            new_train = self.filter_intents(new_train, restrict_intents)
            new_test = self.filter_intents(new_test, restrict_intents)

        # trim_dataset
        trim_prefix = ''
        if dataset_size is not None:
            trim_prefix = '_{}'.format(dataset_size)
            original_dataset_size = len(new_train)
            keep_fraction = dataset_size / original_dataset_size
            intents = self.get_intents(new_train)
            sss = StratifiedShuffleSplit(n_splits=1,
                                         test_size=1 - keep_fraction)
            keep_indices = list(sss.split(intents, intents))[0][0]
            new_train = [new_train[i] for i in keep_indices]
            # new_train = random.sample(new_train, dataset_size)

        # add nones
        train_none_prefix = ''
        test_none_prefix = ''
        if none_size is not None:
            train_none_prefix = '_none_{}'.format(none_size)
            test_none_prefix = '_with_none'
            pseudolabels = None
            if infersent_selection != NO_INFERSENT_SELECTION:
                assert (none_intents is None)
                none_intents, pseudolabels = self.select_none_intents(
                    new_train, restrict_intents, none_folder, cosine_threshold)
                if infersent_selection == 'unsupervised':
                    pseudolabels = None  # ignore pseudolabels
            new_train = self.add_nones(new_train,
                                       none_folder,
                                       none_size=none_size,
                                       none_intents=none_intents,
                                       pseudolabels=pseudolabels,
                                       none_idx=none_idx)
            new_test = self.add_nones(new_test,
                                      none_folder,
                                      none_size=200,
                                      none_intents=none_intents,
                                      none_idx=none_idx)

        if output_folder is not None:
            new_train_path = output_folder / 'train{}{}{}.csv'.format(
                trim_prefix, train_none_prefix, filter_prefix)
            new_test_path = output_folder / 'validate{}{}.csv'.format(
                test_none_prefix, filter_prefix)
        else:
            new_train_path = dataset_folder / 'train{}{}{}.csv'.format(
                trim_prefix, train_none_prefix, filter_prefix)
            new_test_path = dataset_folder / 'validate{}{}.csv'.format(
                test_none_prefix, filter_prefix)

        if skip_header:
            new_train = [header_train] + new_train
            new_test = [header_test] + new_test

        write_csv(new_test, new_test_path)
        write_csv(new_train, new_train_path)

        return new_train_path, new_test_path
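A hypothetical call, assuming builder is an instance of the surrounding class and that the default infersent_selection value matches the NO_INFERSENT_SELECTION constant. With these arguments the prefix logic above would write train_128_none_50.csv and validate_with_none.csv inside data/snips.

train_path, test_path = builder.build_data_files(
    dataset_folder=Path('data/snips'),      # assumed dataset location
    dataset_size=128,
    none_folder=Path('data/none_snips'),    # assumed snips-format None pool
    none_size=50,
    none_idx=3)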
Example #9
from pathlib import Path

from sklearn.model_selection import StratifiedShuffleSplit

from automatic_data_generation.utils.io import read_csv, write_csv

data = read_csv(Path('data.csv'))
header = [data[0]]
data = data[1:]

intents = [row[3] for row in data]

test_fraction = 0.2
test_size = int(test_fraction * len(data))

sss = StratifiedShuffleSplit(n_splits=1,
                             test_size=test_size)

train_indices, test_indices = list(sss.split(intents, intents))[0]

train = header + [data[i] for i in train_indices]
validate = header + [data[i] for i in test_indices]

write_csv(validate, Path('validate.csv'))
write_csv(train, Path('train.csv'))
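An optional sanity check, assuming the same column layout (intent in column 3): the stratified split should leave the intent distribution roughly proportional in both files.

from collections import Counter

# the header row sits at index 0 in both lists
print(Counter(row[3] for row in train[1:]))
print(Counter(row[3] for row in validate[1:]))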