Example #1
0
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    """Derive per-feature transforms from a fluent schema and persist them.

    The fluent schema is read with ``load_schema(schema_in)`` and walked
    one feature at a time.  A type prefixed with ``optional_`` gets a
    ``PresenceTransform`` and the remainder of the type is then handled
    under the name ``<feature>.value``.  Types without a dedicated
    transform are looked up in ``FLUENT_TO_BASIC`` and copied straight
    into the basic schema.

    Args:
        schema_in: Passed to ``load_schema`` to obtain the fluent schema.
        rows_in: Passed to ``build_transforms``; only used when at least
            one ``text`` or ``tags`` feature is present.
        schema_out: Destination handed to ``json_dump`` for the resulting
            basic schema.
        transforms_out: Destination handed to ``pickle_dump`` for the
            list of transforms.

    Returns:
        The name of the feature whose type is ``id`` (the last one seen
        if there are several), or ``None`` when there is none.
    """
    optional_prefix = 'optional_'
    fluent_schema = load_schema(schema_in)

    # Names of every feature whose type ends in 'date' (optional ones too);
    # each date feature is later related to the names sorting before it.
    date_names = []
    for name, kind in fluent_schema.iteritems():
        if kind.endswith('date'):
            date_names.append(name)

    basic_schema = {}
    presence_transforms = []
    all_transforms = []
    pending_builders = []
    id_field = None

    for name, kind in fluent_schema.iteritems():
        # Adjective: strip the optional marker and record presence.
        if kind.startswith(optional_prefix):
            presence = PresenceTransform(name)
            presence_transforms.append(presence)
            all_transforms.append(presence)
            kind = kind[len(optional_prefix):]
            name = '{}.value'.format(name)

        # Noun: pick the transform (or builder) for the remaining type.
        if kind == 'id':
            id_field = name
            continue
        if kind in ('categorical', 'unbounded_categorical'):
            all_transforms.append(StringTransform(name, kind))
            continue
        if kind == 'percent':
            all_transforms.append(PercentTransform(name))
            continue
        if kind == 'sparse_real':
            all_transforms.append(SparseRealTransform(name))
            continue
        if kind == 'text':
            pending_builders.append(TextTransformBuilder(name))
            continue
        if kind == 'tags':
            tags_builder = TextTransformBuilder(name, allow_empty=True)
            pending_builders.append(tags_builder)
            continue
        if kind == 'date':
            earlier = [other for other in date_names if other < name]
            all_transforms.append(DateTransform(name, earlier))
            continue

        # No dedicated transform: the feature passes through unchanged.
        basic_schema[name] = FLUENT_TO_BASIC[kind]

    # Builders need the rows (and the presence transforms) to be resolved.
    if pending_builders:
        built = build_transforms(
            rows_in, presence_transforms, pending_builders)
        all_transforms.extend(built)

    for each in all_transforms:
        basic_schema.update(each.get_schema())

    json_dump(basic_schema, schema_out)
    pickle_dump(all_transforms, transforms_out)

    count_in = len(fluent_schema)
    count_out = len(basic_schema)
    LOG('transformed {} -> {} features'.format(count_in, count_out))
    return id_field
Example #2
0
def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    """Derive per-feature transforms from a fluent schema and persist them.

    The fluent schema is read with ``load_schema(schema_in)`` and walked
    one feature at a time.  A type prefixed with ``optional_`` gets a
    ``PresenceTransform`` and the remainder of the type is then handled
    under the name ``<feature>.value``.  Types without a dedicated
    transform are looked up in ``FLUENT_TO_BASIC`` and copied straight
    into the basic schema.

    Args:
        schema_in: Passed to ``load_schema`` to obtain the fluent schema.
        rows_in: Passed to ``build_transforms``; only used when at least
            one ``text`` or ``tags`` feature is present.
        schema_out: Destination handed to ``json_dump`` for the resulting
            basic schema.
        transforms_out: Destination handed to ``pickle_dump`` for the
            list of transforms.

    Returns:
        The name of the feature whose type is ``id`` (the last one seen
        if there are several), or ``None`` when there is none.
    """
    optional_prefix = 'optional_'
    fluent_schema = load_schema(schema_in)

    # Names of every feature whose type ends in 'date' (optional ones too);
    # each date feature is later related to the names sorting before it.
    date_names = []
    for name, kind in fluent_schema.iteritems():
        if kind.endswith('date'):
            date_names.append(name)

    basic_schema = {}
    presence_transforms = []
    all_transforms = []
    pending_builders = []
    id_field = None

    for name, kind in fluent_schema.iteritems():
        # Adjective: strip the optional marker and record presence.
        if kind.startswith(optional_prefix):
            presence = PresenceTransform(name)
            presence_transforms.append(presence)
            all_transforms.append(presence)
            kind = kind[len(optional_prefix):]
            name = '{}.value'.format(name)

        # Noun: pick the transform (or builder) for the remaining type.
        if kind == 'id':
            id_field = name
            continue
        if kind in ('categorical', 'unbounded_categorical'):
            all_transforms.append(StringTransform(name, kind))
            continue
        if kind == 'percent':
            all_transforms.append(PercentTransform(name))
            continue
        if kind == 'sparse_real':
            all_transforms.append(SparseRealTransform(name))
            continue
        if kind == 'text':
            pending_builders.append(TextTransformBuilder(name))
            continue
        if kind == 'tags':
            tags_builder = TextTransformBuilder(name, allow_empty=True)
            pending_builders.append(tags_builder)
            continue
        if kind == 'date':
            earlier = [other for other in date_names if other < name]
            all_transforms.append(DateTransform(name, earlier))
            continue

        # No dedicated transform: the feature passes through unchanged.
        basic_schema[name] = FLUENT_TO_BASIC[kind]

    # Builders need the rows (and the presence transforms) to be resolved.
    if pending_builders:
        built = build_transforms(
            rows_in, presence_transforms, pending_builders)
        all_transforms.extend(built)

    for each in all_transforms:
        basic_schema.update(each.get_schema())

    json_dump(basic_schema, schema_out)
    pickle_dump(all_transforms, transforms_out)

    count_in = len(fluent_schema)
    count_out = len(basic_schema)
    LOG('transformed {} -> {} features'.format(count_in, count_out))
    return id_field
Example #3
0
def make_fake_transforms(transforms_out):
    """Dump an empty list of transforms to ``transforms_out``.

    Args:
        transforms_out: Destination handed to ``pickle_dump``.
    """
    no_transforms = []
    pickle_dump(no_transforms, transforms_out)
Example #4
0
def make_fake_transforms(transforms_out):
    """Dump an empty list of transforms to ``transforms_out``.

    Args:
        transforms_out: Destination handed to ``pickle_dump``.
    """
    no_transforms = []
    pickle_dump(no_transforms, transforms_out)