def main():
    dataset_dir = FLAGS.dataset_dir
    tmp_dir = FLAGS.tmp_dir
    output_dir = FLAGS.output_dir
    info_df = get_speech_commands_info_df(dataset_dir, from_scratch=False)
    # classes_list = ['zero', 'one', 'two', 'three', 'four',
    #                 'five', 'six', 'seven', 'eight', 'nine']
    classes_list = [
        'yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go'
    ]
    proba_keep = 1  # keep every example (no subsampling)
    shuffled = True
    processed_df = get_processed_df(info_df,
                                    classes=classes_list,
                                    proba_keep=proba_keep,
                                    shuffled=shuffled)
    features_labels_pairs_train =\
      get_features_labels_pairs_generator(processed_df, subset='train')
    features_labels_pairs_test =\
      get_features_labels_pairs_generator(processed_df, subset='test')

    row_count = 1
    col_count = 1
    num_examples_train = 21115  # examples in the 10-class subset above
    num_examples_test = 2567
    output_dim = len(classes_list)
    dataset_name = 'Speech Commands'
    new_dataset_name = 'starcraft'  # anonymized name for the formatted dataset
    sequence_size = 16000  # 1-second clips sampled at 16 kHz
    dataset_formatter = UniMediaDatasetFormatter(
        dataset_name,
        output_dir,
        features_labels_pairs_train,
        features_labels_pairs_test,
        output_dim,
        col_count,
        row_count,
        sequence_size=sequence_size,
        num_examples_train=num_examples_train,
        num_examples_test=num_examples_test,
        is_sequence_col='false',
        is_sequence_row='false',
        has_locality_col='true',
        has_locality_row='true',
        format='DENSE',
        is_sequence='false',
        sequence_size_func=max,
        new_dataset_name=new_dataset_name,
        classes_list=classes_list)

    dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
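
This script reads its paths from FLAGS, which the surrounding file (not shown here) must define. A minimal sketch of those definitions, assuming absl-style flags; the default paths are placeholders:

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_string('dataset_dir', './speech_commands', 'Raw dataset directory.')
flags.DEFINE_string('tmp_dir', '/tmp/speech_commands', 'Scratch directory.')
flags.DEFINE_string('output_dir', './formatted_datasets', 'Where TFRecords are written.')

if __name__ == '__main__':
    app.run(lambda argv: main())  # absl passes argv; main() here takes no arguments
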
Example 2
def main():
    output_dir = FLAGS.output_dir
    classes = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine'
    ]

    row_count = 28
    col_count = 28
    output_dim = len(classes)
    dataset_name = 'MNIST'
    new_dataset_name = 'munster'
    classes_dict = {s: i for i, s in enumerate(classes)}
    features_labels_pairs_train =\
        get_features_labels_pairs_generator(subset='train')
    features_labels_pairs_test =\
        get_features_labels_pairs_generator(subset='test')
    dataset_formatter = UniMediaDatasetFormatter(
        dataset_name,
        output_dir,
        features_labels_pairs_train,
        features_labels_pairs_test,
        output_dim,
        col_count,
        row_count,
        sequence_size=None,  # for strides=2
        num_examples_train=None,
        num_examples_test=None,
        is_sequence_col='false',
        is_sequence_row='false',
        has_locality_col='true',
        has_locality_row='true',
        format='DENSE',
        is_sequence='false',
        sequence_size_func=max,
        new_dataset_name=new_dataset_name,
        classes_dict=classes_dict)

    dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
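
The formatter is driven by the return values of get_features_labels_pairs_generator, which is defined elsewhere in the source file. A minimal sketch of such a function for MNIST, under the assumption that the formatter expects a callable returning a generator of (features, labels) pairs with one flattened frame per example:

import numpy as np
from tensorflow.keras.datasets import mnist

def get_features_labels_pairs_generator(subset='train'):
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x, y = (x_train, y_train) if subset == 'train' else (x_test, y_test)

    def generator():
        for image, label in zip(x, y):
            features = [image.astype(np.float32).flatten().tolist()]  # one 28x28 frame
            labels = [int(label)]  # class index into classes_dict
            yield features, labels

    return generator
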
Example 3

    new_dataset_name = 'chuck'
    num_channels = 3

    features_labels_pairs_train =\
        get_features_labels_pairs_generator(subset='train')
    features_labels_pairs_test =\
        get_features_labels_pairs_generator(subset='test')
    dataset_formatter = UniMediaDatasetFormatter(
        dataset_name,
        output_dir,
        features_labels_pairs_train,
        features_labels_pairs_test,
        output_dim,
        col_count,
        row_count,
        sequence_size=sequence_size,  # for strides=2
        num_channels=num_channels,
        num_examples_train=num_examples_train,
        num_examples_test=num_examples_test,
        is_sequence_col='false',
        is_sequence_row='false',
        has_locality_col='true',
        has_locality_row='true',
        format='DENSE',
        is_sequence='false',
        sequence_size_func=max,
        new_dataset_name=new_dataset_name,
        classes_list=classes_list)

    dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
Example 4
def format_data(input_dir,
                output_dir,
                new_dataset_name,
                train_size=0.8,
                max_num_examples=None,
                num_channels=3,
                classes_list=None):
    print(input_dir)
    input_dir = os.path.normpath(input_dir)
    dataset_name = os.path.basename(input_dir)
    print('Some files in input directory:')
    print(os.listdir(input_dir)[:10])
    print()
    labels_df = get_labels_df(input_dir)
    merged_df = get_merged_df(labels_df, train_size=train_size)

    # In a quick check (max_num_examples <= 4), max_num_examples is the number
    # of examples to format per class; TensorFlow needs at least one example of
    # each class.
    if max_num_examples and max_num_examples <= 4:
        # merged_df = merged_df.sample(n=max_num_examples)
        if 'LabelConfidencePairs' in list(merged_df):
            merged_df = merged_df.groupby('LabelConfidencePairs').apply(
                lambda x: x.sample(n=1))
        elif 'Labels' in list(merged_df):
            merged_df = merged_df.groupby('Labels').apply(
                lambda x: x.sample(n=1))
        else:
            raise Exception('No labels found, please check labels.csv file.')

    all_classes = get_all_classes(merged_df)

    features_labels_pairs_train =\
      get_features_labels_pairs(merged_df, input_dir, subset='train', num_channels=num_channels)
    features_labels_pairs_test =\
      get_features_labels_pairs(merged_df, input_dir, subset='test', num_channels=num_channels)

    output_dim = len(all_classes)
    num_examples_train = merged_df[merged_df['subset'] == 'train'].shape[0]
    num_examples_test = merged_df[merged_df['subset'] == 'test'].shape[0]

    filenames = labels_df['FileName']
    row_count, col_count = im_size(input_dir, filenames)
    sequence_size = seq_size(input_dir, filenames)

    dataset_formatter = UniMediaDatasetFormatter(
        dataset_name,
        output_dir,
        features_labels_pairs_train,
        features_labels_pairs_test,
        output_dim,
        col_count,
        row_count,
        sequence_size=sequence_size,  # for strides=2
        num_channels=num_channels,
        num_examples_train=num_examples_train,
        num_examples_test=num_examples_test,
        is_sequence_col='false',
        is_sequence_row='false',
        has_locality_col='true',
        has_locality_row='true',
        format='COMPRESSED',
        is_sequence='false',
        sequence_size_func=None,
        new_dataset_name=new_dataset_name,
        classes_list=classes_list)

    dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
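
A hedged usage sketch; the paths and dataset name are placeholders, and input_dir is expected to contain the images along with the labels.csv that get_labels_df reads:

format_data(input_dir='/path/to/raw_images',
            output_dir='/path/to/formatted_datasets',
            new_dataset_name='my_image_set',
            train_size=0.8,
            num_channels=3)
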
Example 5
def format_data(input_dir,
                output_dir,
                fake_name,
                effective_sample_num,
                train_size=0.65,
                num_channels=3,
                classes_list=None,
                domain='image',
                output_dim=None,
                input_name=None):
    """ Transform data into TFRecords
    """
    print('format_data: Formatting... {} samples'.format(effective_sample_num))
    if effective_sample_num != 0:
        if domain == 'image':
            format_image.format_data(input_dir,
                                     output_dir,
                                     fake_name,
                                     train_size=0,
                                     max_num_examples=effective_sample_num,
                                     num_channels=num_channels,
                                     classes_list=classes_list,
                                     output_dim=output_dim)
        elif domain == 'video':
            format_video.format_data(input_dir,
                                     output_dir,
                                     fake_name,
                                     train_size=0,
                                     max_num_examples=effective_sample_num,
                                     num_channels=num_channels,
                                     classes_list=classes_list,
                                     output_dim=output_dim)
        elif domain == 'series':
            format_series.format_data(input_dir,
                                      output_dir,
                                      fake_name,
                                      train_size=0,
                                      max_num_examples=effective_sample_num,
                                      num_channels=num_channels,
                                      classes_list=classes_list,
                                      output_dim=output_dim)

        elif domain == 'tabular':
            D = DataManager(input_name,
                            input_dir,
                            replace_missing=False,
                            verbose=False)  # 'verbose' was undefined in this scope; assuming a quiet default
            new_dataset_name = "unlabelled"

            if not os.path.isdir(output_dir):
                os.mkdir(output_dir)

            dataset_dir = os.path.join(output_dir, new_dataset_name)

            if not os.path.isdir(dataset_dir):
                os.mkdir(dataset_dir)

            # Format test set
            set_type = 'test'

            filepath = os.path.join(dataset_dir, "sample-unlabelled.tfrecord")
            metadata, features, labels = format_tabular._prepare_metadata_features_and_labels(
                D, set_type=set_type)
            format_tabular.convert_vectors_to_sequence_example(
                filepath,
                metadata,
                features,
                labels,
                D.info,
                max_num_examples=effective_sample_num)
        elif domain == 'text':
            name = fake_name

            language = nlp_to_tfrecords.get_language(
                os.path.join(input_dir, 'meta.json'))
            train_data = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, 'train.data'))
            train_solution = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, 'train.solution'))
            test_data = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, 'test.data'))
            test_solution = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, 'test.solution'))

            # Create vocabulary
            vocabulary = nlp_to_tfrecords.create_vocabulary(
                train_data + test_data, language)

            # Convert data into sequences of integers
            features_labels_pairs_train = nlp_to_tfrecords.get_features_labels_pairs(
                train_data,
                train_solution,
                vocabulary,
                language,
                format=format)  # 'format' is not defined locally; presumably a module-level setting in the original source
            features_labels_pairs_test = nlp_to_tfrecords.get_features_labels_pairs(
                test_data, test_solution, vocabulary, language, format=format)

            # Write data in TFRecords and vocabulary in metadata
            output_dim = nlp_to_tfrecords.get_output_dim(train_solution)
            col_count, row_count = 1, 1
            sequence_size = -1
            num_channels = 1  #len(vocabulary)
            num_examples_train = len(train_data)
            num_examples_test = len(test_data)
            new_dataset_name = name  # same name
            classes_list = None
            dataset_formatter = UniMediaDatasetFormatter(
                name,
                output_dir,
                features_labels_pairs_train,
                features_labels_pairs_test,
                output_dim,
                col_count,
                row_count,
                sequence_size=sequence_size,  # for strides=2
                num_channels=num_channels,
                num_examples_train=num_examples_train,
                num_examples_test=num_examples_test,
                is_sequence_col='false',
                is_sequence_row='false',
                has_locality_col='true',
                has_locality_row='true',
                format='DENSE',
                label_format='DENSE',
                is_sequence='false',
                sequence_size_func=None,
                new_dataset_name=new_dataset_name,
                classes_list=classes_list,
                channels_dict=vocabulary)
            dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()

    print('format_data: done.')
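
The text branch leans on nlp_to_tfrecords to turn documents into sequences of integers via a vocabulary. A minimal sketch of that step, assuming ids start at 1 (0 reserved for padding) and Chinese text is split per character; the real helpers may differ:

def create_vocabulary(documents, language='EN'):
    # Map each distinct token to a positive integer id; 0 is left for padding.
    vocabulary = {}
    for doc in documents:
        tokens = list(doc) if language == 'ZH' else doc.split()
        for token in tokens:
            if token not in vocabulary:
                vocabulary[token] = len(vocabulary) + 1
    return vocabulary

def encode(doc, vocabulary, language='EN'):
    # Turn one document into its id sequence, skipping out-of-vocabulary tokens.
    tokens = list(doc) if language == 'ZH' else doc.split()
    return [vocabulary[t] for t in tokens if t in vocabulary]
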
Example 6
    features_labels_pairs_test =\
      get_features_labels_pairs(text_labels_pairs_test)

    print(features_labels_pairs_train)
    print(features_labels_pairs_test)

    col_count = EMBEDDING_DIMENSION
    row_count = 1
    dataset_formatter = UniMediaDatasetFormatter(
        dataset_name,
        output_dir,
        features_labels_pairs_train,
        features_labels_pairs_test,
        output_dim,
        col_count,
        row_count,
        sequence_size=None,
        is_sequence_col='false',
        is_sequence_row='false',
        has_locality_col='false',
        has_locality_row='false',
        format='DENSE',
        is_sequence='false',
        new_dataset_name=new_dataset_name)
    print("Begin formatting dataset: {}.".format(dataset_name))
    print("Basic dataset info:")
    dataset_info = dataset_formatter.__dict__.copy()
    dataset_info.pop('features_labels_pairs_train', None)
    dataset_info.pop('features_labels_pairs_test', None)
    pprint(dataset_info)
    dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
Example 7
def format_data(input_dir,
                output_dir,
                fake_name,
                effective_sample_num,
                train_size=0.65,
                num_channels=3,
                classes_list=None,
                domain='image',
                quick_check=False):
    """ Transform data into TFRecords
    """
    print('format_data: Formatting... {} samples'.format(effective_sample_num))
    if effective_sample_num != 0:
        if domain == 'image':
            format_image.format_data(input_dir,
                                     output_dir,
                                     fake_name,
                                     train_size=train_size,
                                     max_num_examples=effective_sample_num,
                                     num_channels=num_channels,
                                     classes_list=classes_list,
                                     quick_check=quick_check)
        elif domain == 'video':
            format_video.format_data(input_dir,
                                     output_dir,
                                     fake_name,
                                     train_size=train_size,
                                     max_num_examples=effective_sample_num,
                                     num_channels=num_channels,
                                     classes_list=classes_list,
                                     quick_check=quick_check)
        elif domain == 'series':
            format_series.format_data(input_dir,
                                      output_dir,
                                      fake_name,
                                      train_size=train_size,
                                      max_num_examples=effective_sample_num,
                                      num_channels=num_channels,
                                      classes_list=classes_list,
                                      quick_check=quick_check)
        elif domain == 'tabular':
            max_num_examples_train = int(effective_sample_num * train_size)
            max_num_examples_test = effective_sample_num - max_num_examples_train
            num_shards_train = 1
            num_shards_test = 1
            print(fake_name)
            format_tabular.press_a_button_and_give_me_an_AutoDL_dataset(
                input_dir,
                fake_name,
                output_dir,
                None,
                None,
                num_shards_train,
                num_shards_test,
                new_dataset_name=fake_name)

        elif domain == 'text':
            name = fake_name

            language = nlp_to_tfrecords.get_language(
                os.path.join(input_dir, name + '.data', 'meta.json'))
            train_data = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, name + '.data', 'train.data'))
            train_solution = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, name + '.data', 'train.solution'))
            test_data = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, name + '.data', 'test.data'))
            test_solution = nlp_to_tfrecords.read_file(
                os.path.join(input_dir, name + '.solution'))  # note: the test solution sits beside the name.data/ directory, not inside it

            # Create vocabulary
            vocabulary = nlp_to_tfrecords.create_vocabulary(
                train_data + test_data, language)

            # Convert data into sequences of integers
            features_labels_pairs_train = nlp_to_tfrecords.get_features_labels_pairs(
                train_data,
                train_solution,
                vocabulary,
                language,
                format=format)  # 'format' is not defined locally; presumably a module-level setting in the original source
            features_labels_pairs_test = nlp_to_tfrecords.get_features_labels_pairs(
                test_data, test_solution, vocabulary, language, format=format)

            # Write data in TFRecords and vocabulary in metadata
            output_dim = nlp_to_tfrecords.get_output_dim(train_solution)
            col_count, row_count = 1, 1
            sequence_size = -1
            num_channels = 1  #len(vocabulary)
            num_examples_train = len(train_data)
            num_examples_test = len(test_data)
            new_dataset_name = name  # same name
            classes_list = None
            dataset_formatter = UniMediaDatasetFormatter(
                name,
                output_dir,
                features_labels_pairs_train,
                features_labels_pairs_test,
                output_dim,
                col_count,
                row_count,
                sequence_size=sequence_size,  # for strides=2
                num_channels=num_channels,
                num_examples_train=num_examples_train,
                num_examples_test=num_examples_test,
                is_sequence_col='false',
                is_sequence_row='false',
                has_locality_col='true',
                has_locality_row='true',
                format='DENSE',
                label_format='DENSE',
                is_sequence='false',
                sequence_size_func=None,
                new_dataset_name=new_dataset_name,
                classes_list=classes_list,
                channels_dict=vocabulary)
            dataset_formatter.press_a_button_and_give_me_an_AutoDL_dataset()
        else:
            raise Exception('Unknown domain: {}'.format(domain))
    print('format_data: done.')
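
A hedged usage sketch for the text branch; the names are placeholders, and the layout follows the paths read above (an o1.data/ directory holding the train/test files, next to an o1.solution file with the test labels):

format_data(input_dir='/path/to/datasets',
            output_dir='/path/to/formatted_datasets',
            fake_name='o1',
            effective_sample_num=100,
            domain='text')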