Example #1
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = [
            "f{:03d}".format(feat_num + j) for j in range(num_feats_per_file)
        ]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
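A minimal usage sketch for the helper above (a sketch only: it assumes the function lives in a test module that defines the module-level _my_dir directory and already imports numpy as np, os, join/exists from os.path, OrderedDict, and SKLL's FeatureSet and Writer; the 3 feature files and the '.jsonlines' suffix are illustrative choices):

from os.path import join
from skll.data import Reader

# write the unmerged per-subset feature files plus the merged file
# ('all.jsonlines') under <_my_dir>/train/test_merging
make_merging_data(3, '.jsonlines', False)

# read the merged file back to confirm all 500 examples were written
merged_fs = Reader.for_path(
    join(_my_dir, 'train', 'test_merging', 'all.jsonlines')).read()
print(len(merged_fs))  # 500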
Example #2
def make_merging_data(num_feat_files, suffix, numeric_ids):
    num_examples = 500
    num_feats_per_file = 17

    np.random.seed(1234567890)

    merge_dir = join(_my_dir, 'train', 'test_merging')
    if not exists(merge_dir):
        os.makedirs(merge_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j) if not numeric_ids else j
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
             in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)

    # Unmerged
    subset_dict = {}
    for i in range(num_feat_files):
        feat_num = i * num_feats_per_file
        subset_dict['{}'.format(i)] = ["f{:03d}".format(feat_num + j) for j in
                                       range(num_feats_per_file)]
    train_path = join(merge_dir, suffix)
    train_fs = FeatureSet('train', ids, labels=labels, features=features)
    Writer.for_path(train_path, train_fs, subsets=subset_dict).write()

    # Merged
    train_path = join(merge_dir, 'all{}'.format(suffix))
    Writer.for_path(train_path, train_fs).write()
Example #3
def main():
    """
    Create directories and split CSV files into subsets.
    """
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {
        'vitals': ['Sex', 'Age'],
        'socioeconomic': ['Pclass', 'Fare'],
        'family': ['SibSp', 'Parch'],
        'misc': ['Embarked']
    }
    features_to_keep = list(chain(*subset_dict.values()))

    # Create directories to store files
    if not os.path.exists('titanic/train'):
        logger.info('Creating titanic/train directory')
        os.makedirs('titanic/train')
    if not os.path.exists('titanic/dev'):
        logger.info('Creating titanic/dev directory')
        os.makedirs('titanic/dev')
    if not os.path.exists('titanic/train+dev'):
        logger.info('Creating titanic/train+dev directory')
        os.makedirs('titanic/train+dev')
    if not os.path.exists('titanic/test'):
        logger.info('Creating titanic/test directory')
        os.makedirs('titanic/test')

    usecols_train = features_to_keep + ['PassengerId', 'Survived']
    usecols_test = features_to_keep + ['PassengerId']

    # Read and write training FeatureSet
    train_fs = Reader.for_path('titanic/train.csv',
                               label_col='Survived',
                               id_col='PassengerId',
                               drop_blanks=True,
                               pandas_kwargs={
                                   'usecols': usecols_train
                               },
                               quiet=False,
                               sparse=False).read()

    train_fs.filter(features=features_to_keep)
    num_train_dev = len(train_fs)
    num_train = int((num_train_dev / 5) * 4)
    writer = Writer.for_path('titanic/train/.csv',
                             train_fs[:num_train],
                             id_col='PassengerId',
                             label_col='Survived',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write train+dev set for training model to use to generate predictions on
    # test
    writer = Writer.for_path('titanic/train+dev/.csv',
                             train_fs,
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Write dev FeatureSet
    writer = Writer.for_path('titanic/dev/.csv',
                             train_fs[num_train:],
                             label_col='Survived',
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()

    # Read and write test FeatureSet
    test_fs = Reader.for_path('titanic/test.csv',
                              label_col='Survived',
                              drop_blanks=True,
                              pandas_kwargs={
                                  'usecols': usecols_test
                              },
                              quiet=False,
                              sparse=False).read()

    test_fs.filter(features=features_to_keep)
    num_test = len(test_fs)
    test_fs.ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    writer = Writer.for_path('titanic/test/.csv',
                             test_fs,
                             id_col='PassengerId',
                             quiet=False,
                             subsets=subset_dict)
    writer.write()
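A small read-back sketch for the files the script writes (assumptions: the script has been run from a directory containing the Kaggle titanic/train.csv and titanic/test.csv, and the Writer names each subset file after its subset key, e.g. titanic/train/vitals.csv, keeping the PassengerId and Survived columns):

from skll.data import Reader

# load just the 'vitals' subset (Sex and Age) that the script wrote out
vitals_fs = Reader.for_path('titanic/train/vitals.csv',
                            label_col='Survived',
                            id_col='PassengerId').read()
print(len(vitals_fs))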
Example #4
def make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # when we are not using labels, avoid zero-valued features: some
        # subset of features could end up being all zeros, and if that subset
        # is written out to a file below, some formats (e.g., megam) write
        # nothing at all for it, which causes problems when reading the file
        lowest_feature_value = 0 if with_labels else 1
        x = {
            "f{:03d}".format(feat_num):
            np.random.randint(lowest_feature_value, 4 + lowest_feature_value)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {
            label: num
            for num, label in enumerate(
                sorted({
                    label
                    for label in labels if not isinstance(label, (int, float))
                }))
        }
        # Add a fake item to the label map for None
        label_map[None] = '00000'
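        # with the "cat"/"dog" labels generated above, this works out to
        # {'cat': 0, 'dog': 1, None: '00000'}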
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}{}'.format(feature_name_prefix, i,
                                            with_labels_part, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(
        convert_dir, '{}{}_all{}'.format(feature_name_prefix, with_labels_part,
                                         to_suffix))
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)

    # work around FeatureSet storing NaNs instead of None when there are no
    # labels, since the NaNs cause problems later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()
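A minimal sketch of how this helper might be invoked (the .jsonlines/.arff format pair is an illustrative assumption, and the surrounding test module is assumed to provide _my_dir and the imports used above):

# labeled data: unmerged .jsonlines feature files plus one merged .arff file,
# all written under <_my_dir>/train/test_conversion
make_conversion_data(3, '.jsonlines', '.arff')

# unlabeled variant: file names gain an '_unlabeled' part, labels are None,
# and every feature value is at least 1 so no example serializes as empty
make_conversion_data(3, '.jsonlines', '.arff', with_labels=False)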
Example #5
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
             in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels if
                                   not isinstance(label, (int, float))}))}
    # Add a fake item to the label map for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
                                                        i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Example #6
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {
        label: num
        for num, label in enumerate(
            sorted({
                label
                for label in labels if not isinstance(label, (int, float))
            }))
    }
    # Add a fake item to the label map for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}'.format(feature_name_prefix, i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Example #7
def make_conversion_data(num_feat_files, from_suffix, to_suffix, with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # when we are not using labels, avoid zero-valued features: some
        # subset of features could end up being all zeros, and if that subset
        # is written out to a file below, some formats (e.g., megam) write
        # nothing at all for it, which causes problems when reading the file
        lowest_feature_value = 0 if with_labels else 1
        x = {"f{:03d}".format(feat_num): np.random.randint(lowest_feature_value, 4 + lowest_feature_value) for feat_num
             in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {label: num for num, label in
                     enumerate(sorted({label for label in labels if
                                       not isinstance(label, (int, float))}))}
        # Add a fake item to the label map for None
        label_map[None] = '00000'
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}{}'.format(feature_name_prefix,
                                                          i,
                                                          with_labels_part,
                                                          from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}{}_all{}'.format(feature_name_prefix,
                                                       with_labels_part,
                                                       to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)

    # work around FeatureSet storing NaNs instead of None when there are no
    # labels, since the NaNs cause problems later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()