def make_transforms(schema_in, rows_in, schema_out, transforms_out):
    """Compile a fluent schema into a basic schema plus a transform list.

    Reads the fluent schema from ``schema_in``, builds one transform (or
    transform builder) per feature according to its fluent type, writes the
    resulting basic schema as JSON to ``schema_out``, and pickles the
    transform list to ``transforms_out``.

    Parameters:
        schema_in - path to the fluent schema, read via ``load_schema``
        rows_in - rows input; consumed only when text/tags builders need a
            pass over the data (via ``build_transforms``)
        schema_out - destination path for the generated basic schema (JSON)
        transforms_out - destination path for the pickled transforms

    Returns:
        The feature name declared with fluent type ``'id'``, or ``None``
        when the schema declares no id field.
    """
    # NOTE: .items() (not the Python-2-only .iteritems()) keeps this
    # function working under both Python 2 and Python 3; iteration
    # behavior is identical.
    fluent_schema = load_schema(schema_in)
    basic_schema = {}
    pre_transforms = []
    transforms = []
    builders = []
    # Collect every date-typed feature name (this also matches
    # 'optional_date' via endswith); each DateTransform below receives the
    # lexicographically-smaller date names as its relatives.
    dates = [
        feature_name
        for feature_name, fluent_type in fluent_schema.items()
        if fluent_type.endswith('date')
    ]
    id_field = None
    for feature_name, fluent_type in fluent_schema.items():
        # parse adjectives
        if fluent_type.startswith('optional_'):
            transform = PresenceTransform(feature_name)
            pre_transforms.append(transform)
            transforms.append(transform)
            fluent_type = fluent_type[len('optional_'):]
            # the optional feature's payload lives under a '.value' subfield
            feature_name = '{}.value'.format(feature_name)
        # parse nouns
        if fluent_type == 'id':
            id_field = feature_name
        elif fluent_type in ['categorical', 'unbounded_categorical']:
            transforms.append(StringTransform(feature_name, fluent_type))
        elif fluent_type == 'percent':
            transforms.append(PercentTransform(feature_name))
        elif fluent_type == 'sparse_real':
            transforms.append(SparseRealTransform(feature_name))
        elif fluent_type == 'text':
            builders.append(TextTransformBuilder(feature_name))
        elif fluent_type == 'tags':
            builders.append(
                TextTransformBuilder(feature_name, allow_empty=True))
        elif fluent_type == 'date':
            relatives = [other for other in dates if other < feature_name]
            transforms.append(DateTransform(feature_name, relatives))
        else:
            # plain basic type: no transform needed, map it directly
            basic_type = FLUENT_TO_BASIC[fluent_type]
            basic_schema[feature_name] = basic_type
    if builders:
        # text/tags transforms require a pass over the rows to build
        # their vocabularies
        transforms += build_transforms(rows_in, pre_transforms, builders)
    # every transform contributes its output columns to the basic schema
    for transform in transforms:
        basic_schema.update(transform.get_schema())
    json_dump(basic_schema, schema_out)
    pickle_dump(transforms, transforms_out)
    LOG('transformed {} -> {} features'.format(
        len(fluent_schema), len(basic_schema)))
    return id_field
def make_fake_transforms(transforms_out):
    """Write an empty transform list to ``transforms_out``.

    Used when a dataset requires no feature transforms but downstream
    code still expects a pickled transforms file to exist.
    """
    no_transforms = []
    pickle_dump(no_transforms, transforms_out)