class Word2Vec(Model):
    """
    Model that builds and trains a `GensimWord2Vec` instance.

    The constructor arguments of the underlying model are whatever `to_kwargs`
    returns for this spec. The vocabulary is built from `sentences`; training
    iterates `train_iterator`, or `sentences` again when no separate iterator
    was supplied.
    """
    sentences = CollectionField(0, serialize=False)
    size = ModelParameter(1, default=100)
    alpha = ModelParameter(2, default=0.025)
    window = ModelParameter(3, default=5)
    min_count = ModelParameter(4, default=5)
    max_vocab_size = ModelParameter(5, default=None)
    sample = ModelParameter(6, default=0.001)
    seed = ModelParameter(7, default=1)
    workers = PrimitiveField(8, default=3, serialize=False)
    train_iterator = PrimitiveField(
        default=None,
        help='When sentences is not a rewindable iterator, you must specify another copy of it here',
        serialize=False)

    def apply(self, runner):
        """
        Builds the vocabulary and trains the model.

        :param runner: Runner executing this operation (not used by this method)
        :return: The trained `GensimWord2Vec` instance
        """
        model = GensimWord2Vec(**self.to_kwargs())

        # Resolve the fallback in a local variable: running the model must not
        # rewrite the fields of the spec that describes it.
        train_iterator = self.train_iterator
        if train_iterator is None:
            train_iterator = self.sentences

        model.build_vocab(self.sentences)
        model.train(train_iterator)
        return model
class RenameField(FilteredStorageRefactor):
    """
    Refactor that moves the value stored under `source` to the key `target`
    on every document accepted by `matches`.
    """
    source = PrimitiveField(1)
    target = PrimitiveField(2)

    def transformation(self, doc):
        """
        :param doc: Document to transform
        :return: A copy of `doc` with the key renamed when the document matches,
            otherwise `doc` itself
        """
        if not self.matches(doc):
            return doc

        # Work on a copy so the incoming document is left untouched
        renamed = doc.copy()
        renamed[self.target] = renamed.pop(self.source)
        return renamed
class AddField(FilteredStorageRefactor):
    """
    Refactor that sets `field_name` to `default_value` on every document
    accepted by `matches`.
    """
    field_name = PrimitiveField(1)
    default_value = PrimitiveField(2)

    def transformation(self, doc):
        """
        :param doc: Document to transform
        :return: A copy of `doc` carrying the new field when the document matches,
            otherwise `doc` itself
        """
        if not self.matches(doc):
            return doc

        # Work on a copy so the incoming document is left untouched
        extended = doc.copy()
        extended[self.field_name] = self.default_value
        return extended
class as_operation(GenericDecorator):
    """
    Decorator that builds an operation class out of a callable

    :param out_type: Base class of the operation to be built. Defaults to `Operation`
    :param out_name: Name of the class to be built. When it is not given, `operation_from_func`
        receives None for it
    :param cache_on: Forwarded to `operation_from_func` as `cache_on`
    :param args_specifications: Forwarded to `operation_from_func` as `args_specifications`
    """
    out_type = PrimitiveField(default=Operation)
    out_name = PrimitiveField(default=None)
    cache_on = SpecField(default=None)
    args_specifications = KwargsField()

    def create_decorated(self, to_wrap, func_to_execute, f_spec=None, first_arg=None):
        """
        Delegates the construction of the operation class to `operation_from_func`

        :param to_wrap: Original wrapped function
        :param func_to_execute: Function that the operation will execute
        :param f_spec: Argspec of the function, computed from `to_wrap` when falsy
        :param first_arg: Forwarded to `operation_from_func` as `first_arg`
        :return: Whatever `operation_from_func` returns
        """
        if not f_spec:
            f_spec = inspect.getargspec(to_wrap)

        return operation_from_func(
            to_wrap=to_wrap,
            func_to_execute=func_to_execute,
            out_type=self.out_type,
            out_name=self.out_name,
            args_specifications=self.args_specifications,
            f_spec=f_spec,
            method_type=self.method_type,
            first_arg=first_arg,
            cache_on=self.cache_on)

    @staticmethod
    def get_current_operation():
        """
        Should be called inside a function decorated with as_operation

        :return: The `self` local found two frames up the call stack
        :raises RuntimeError: when that local is not an `Operation`
        """
        frame = inspect.currentframe()
        try:
            # The first f_back is the function that called us, the second one is the
            # apply method of the dynamically created operation
            candidate = frame.f_back.f_back.f_locals['self']
            if isinstance(candidate, Operation):
                return candidate
            raise RuntimeError(
                "This function should be called inside an operation created with the as_operation decorator"
            )
        finally:
            # Avoid reference cycle
            del frame
class ProjectedRefactor(StorageRefactor):
    """
    This class handles a different semantic for the storage_refactor field:
    it only propagates on doc[field]

    `field` may be a dotted path ('a.b.c'), in which case the refactor is applied to
    doc['a']['b']['c'].
    """
    field = PrimitiveField(0)

    def chain_transformations(self, doc):
        """
        Applies only this transformation; the wrapped `storage_refactor` is chained
        inside `transformation`, on the projected value.
        """
        return self.transformation(doc)

    def transformation(self, doc):
        """
        Applies `storage_refactor` to the value found at `field`

        :param doc: Anything. Values that are not dicts are returned untouched
        :return: A copy of `doc` where the value at `field` has been refactored. If the
            path does not exist, or goes through something that is not a dict, the copy
            is returned without changes
        """
        # Everything should be able to receive anything. I don't like that
        if not isinstance(doc, dict):
            return doc

        doc = doc.copy()
        subfields = self.field.split('.')

        subdoc = doc
        for field in subfields[:-1]:
            child = subdoc.get(field)
            if not isinstance(child, dict):
                # Missing or not traversable: there is nothing to project on
                return doc
            # doc.copy() is shallow, so every nested dict on the path has to be copied
            # as well; otherwise the assignment below would leak into the caller's document
            child = child.copy()
            subdoc[field] = child
            subdoc = child

        last_field = subfields[-1]
        if last_field in subdoc:
            subdoc[last_field] = self.storage_refactor.chain_transformations(subdoc[last_field])
        return doc
class RemoveField(FilteredStorageRefactor):
    """
    Refactor that drops `field_name` from every document accepted by `matches`.
    """
    field_name = PrimitiveField(1)

    def transformation(self, doc):
        """
        :param doc: Document to transform
        :return: A copy of `doc` without the field when the document matches,
            otherwise `doc` itself
        """
        if not self.matches(doc):
            return doc

        trimmed = doc.copy()
        # The default makes the pop a no-op when the field is already absent
        trimmed.pop(self.field_name, None)
        return trimmed
class GenericDecorator(Spec):
    """
    Takes care of the boilerplate needed to write a decorator that works on functions,
    instance methods and class methods

    :param method_type: None means that the decorated callable is a plain function, any other
        value means it is a method. With method_type == 'instance' it is handled as an
        instance method, otherwise as a classmethod
    """
    method_type = PrimitiveField(0, default=None)

    def __get__(self, instance, owner):
        """
        Descriptor hook: builds the decorated callable with its first argument already bound
        """
        for_instance = self.method_type == 'instance'

        # Nothing to bind to yet, hand back the decorator itself
        if (for_instance and instance is None) or \
                (self.method_type == 'class' and owner is None):
            return self

        first_arg = instance if for_instance else owner

        @wraps(self.func)
        def new_f(*args, **kwargs):
            return self.func(first_arg, *args, **kwargs)

        cls = owner if instance is None else type(instance)
        assert cls is not None

        f_spec = inspect.getargspec(self.func)
        return self.create_decorated(self.func, new_f, f_spec, first_arg=first_arg)

    def __call__(self, func):
        """
        Decorates `func`. Methods are only recorded here, the actual decoration is deferred
        to `__get__`
        """
        if not self.method_type:
            return self.create_decorated(func, func)

        self.func = func
        return self

    def create_decorated(self, to_wrap, func_to_execute, f_spec=None, first_arg=None):
        """
        Abstract method that should be implemented in order to build a decorator

        `to_wrap` and `func_to_execute` differ for instance methods and class methods: in those
        cases `func_to_execute` already has its first argument bound.
        If `to_wrap` is just a function, then `to_wrap == func_to_execute`

        :param to_wrap: Original wrapped function
        :param func_to_execute: You should execute this function
        :param f_spec: The argspec of the function to be decorated, if None, it should be computed
            from to_wrap (TODO: remove this argument)
        :param first_arg: `self` if it's an instance method, `cls` if it's a classmethod,
            None otherwise
        """
        raise NotImplementedError()
class ChangeType(FilteredStorageRefactor):
    """
    Refactor that rewrites the 'type' entry of every document accepted by `matches`
    so that it holds the import path of `new_type`.
    """
    new_type = PrimitiveField(1)

    def __init__(self, *args, **kwargs):
        super(ChangeType, self).__init__(*args, **kwargs)
        # new_type has to be a Spec subclass
        assert issubclass(self.new_type, Spec)

    def transformation(self, doc):
        """
        :param doc: Document to transform
        :return: A copy of `doc` with the new 'type' when the document matches,
            otherwise `doc` itself
        """
        if not self.matches(doc):
            return doc

        retyped = doc.copy()
        retyped['type'] = get_import_path(self.new_type)
        return retyped
class FilteredStorageRefactor(StorageRefactor):
    """
    Storage refactor that only applies to documents of a given type.

    `field_type` is either the type string itself or an object whose import path
    (as computed by `get_import_path`) is the type string.
    """
    field_type = PrimitiveField(0)

    def matches(self, doc):
        """
        :param doc: Anything
        :return: True when `doc` is a dict whose 'type' entry equals the type string
            of this refactor
        """
        # Dicts without a 'type' entry do not match; indexing them directly
        # would raise KeyError
        return isinstance(doc, dict) and 'type' in doc and doc['type'] == self.get_field_type_string()

    def get_field_type_string(self):
        """
        :return: `field_type` when it is already a string, otherwise its import path
        """
        if isinstance(self.field_type, basestring):
            return self.field_type
        else:
            return get_import_path(self.field_type)
class FilteredStorageRefactor(StorageRefactor):
    """
    Storage refactor restricted to the documents of one spec type.

    `spec_type` is either the type string itself or an object whose import path
    (as computed by `get_import_path`) is the type string.
    """
    spec_type = PrimitiveField(0)

    def matches(self, doc):
        """
        :param doc: Anything
        :return: Whether `doc` is a dict whose 'type' entry equals the type string
            of this refactor
        """
        # Everything should be able to receive anything. I don't like that
        if not isinstance(doc, dict) or 'type' not in doc:
            return False
        return doc['type'] == self.get_spec_type_string()

    def get_spec_type_string(self):
        """
        :return: `spec_type` when it is already a string, otherwise its import path
        """
        spec_type = self.spec_type
        return spec_type if isinstance(spec_type, basestring) else get_import_path(spec_type)
class AutosavedFunction(as_operation):
    """
    Variant of `as_operation` whose result is a callable object: every call builds
    the operation and runs it through `cache_on.execute`.
    """
    cache_on = PrimitiveField()  # make cache_on a required parameter

    def create_decorated(self, to_wrap, func_to_execute, f_spec=None, first_arg=None):
        """
        :return: A `FunctionWrapper` instance. Calling it executes the operation on
            `cache_on`; the keyword argument `force_run` (default False) is removed from
            the call and handed to `execute` as `force`
        """
        OperationClass = super(AutosavedFunction, self).create_decorated(
            to_wrap, func_to_execute, f_spec=f_spec, first_arg=first_arg)

        class AutosavedOperation(OperationClass):
            def to_dict(self, include_all=False):
                serialized = super(AutosavedOperation, self).to_dict(include_all=include_all)
                # The stored type points at the `operation_class` attribute of the wrapper
                if first_arg is None:
                    type_path = get_import_path(func_to_execute, 'operation_class')
                else:
                    type_path = get_import_path(first_arg, func_to_execute.__name__, 'operation_class')
                serialized['type'] = type_path
                return serialized

            def __repr__(self):
                # Shown as the underlying operation class would show the same fields
                fields = self.to_dict(include_all=True)
                fields.pop('type')
                return OperationClass(**fields).__repr__()

        class FunctionWrapper(object):
            @property
            def wrapped_function(self):
                return to_wrap

            @property
            def operation_class(self):
                return AutosavedOperation

            # The wrapper instance is received as `_` so that `self` keeps referring
            # to the decorator
            @wraps(to_wrap)
            def __call__(_, *args, **kwargs):
                force_run = kwargs.pop('force_run', False)
                return self.cache_on.execute(AutosavedOperation(*args, **kwargs), force=force_run)

        return FunctionWrapper()
class ChangeField(FilteredStorageRefactor):
    """
    Refactor that replaces `old_value` with `new_value` on `field_name`, only for the
    documents of the right type that currently hold `old_value` there.
    """
    field_name = PrimitiveField(1)
    old_value = PrimitiveField(2)
    new_value = PrimitiveField(3)

    def matches(self, doc):
        """
        :return: A truthy value when the parent class accepts `doc`, the field is present
            and it holds `old_value`
        """
        accepted = super(ChangeField, self).matches(doc)
        return accepted and self.field_name in doc and doc[self.field_name] == self.old_value

    def transformation(self, doc):
        """
        :param doc: Document to transform
        :return: A copy of `doc` with the new value when the document matches,
            otherwise `doc` itself
        """
        if not self.matches(doc):
            return doc

        updated = doc.copy()
        updated[self.field_name] = self.new_value
        return updated

    @property
    def recurse_first(self):
        # we want to apply this transformation last
        return self.field_name == 'type'
class MemoryObject(Operation):
    """
    Operation whose result is simply the object it holds.

    The object is serialized as `id(obj)`; when reading it back, `load_object` is
    called with the stored value.
    """
    obj = PrimitiveField(0)

    def apply(self, runner):
        """
        :param runner: Runner executing this operation (not used by this method)
        :return: The wrapped object
        """
        return self.obj

    def to_dict(self, include_all=False):
        """
        :return: The dict built by the parent class, with 'obj' replaced by `id(self.obj)`
        """
        serialized = super(MemoryObject, self).to_dict(include_all=include_all)
        serialized['obj'] = id(self.obj)
        return serialized

    @classmethod
    def _from_dict(cls, kwargs, path=None):
        """
        :return: The instance built by the parent class, with `obj` replaced by
            `load_object(obj)`
        """
        instance = super(MemoryObject, cls)._from_dict(kwargs, path=path)
        instance.obj = load_object(instance.obj)
        return instance
def operation_from_func(to_wrap, func_to_execute, out_type, out_name, args_specifications, f_spec=None,
                        method_type=None, first_arg=None, cache_on=None):
    """
    Builds (or updates) the operation class that represents a call to a function.

    In the case of methods, to_wrap is not the same as func_to_execute

    :param to_wrap: See `GenericDecorator.create_decorated` for an explanation
    :param func_to_execute: See `GenericDecorator.create_decorated` for an explanation
    :param out_type: Base class used when the operation class has to be created
    :param out_name: Name of the operation class, `to_wrap.__name__` when falsy
    :param args_specifications: Maps argument names to the field (instance, class or factory
        function) describing them. Arguments without an entry get a `PrimitiveField`
    :param f_spec: Argspec of the function, computed from `to_wrap` when falsy
    :param method_type: None for functions, 'instance' or 'class' for methods
    :param first_arg: The object the method is bound to, if any
    :param cache_on: A data store onto which the operation should be cached
    :return: The operation class
    """
    if not f_spec:
        f_spec = inspect.getargspec(to_wrap)
    if not out_name:
        out_name = to_wrap.__name__

    on_instance = method_type == 'instance'
    on_class = method_type == 'class'

    # TODO: find the first_arg where the method was defined
    if on_instance and not isinstance(first_arg, Spec):
        # Only when it's an instance of Spec we can identify
        out_name = '{}@{}'.format(out_name, id(first_arg))

    default_values = get_default_values(f_spec)

    attrs = {}
    # Two independent position counters: one for UnboundField, one for everything else
    bound_pos = 0
    unbound_pos = 0
    for arg in f_spec.args:
        if (on_instance and arg == 'self') or (on_class and arg == 'cls'):
            continue

        if arg not in args_specifications:
            spec = PrimitiveField(bound_pos)
            bound_pos += 1
        else:
            spec = args_specifications[arg]
            if inspect.isclass(spec) and issubclass(spec, Spec):
                spec = SpecField(base_type=spec)

            # It can be either a class, or the instance itself
            if inspect.isclass(spec) or inspect.isfunction(spec):
                spec = spec()

            if isinstance(spec, UnboundField):
                spec.pos = unbound_pos
                unbound_pos += 1
            else:
                spec.pos = bound_pos
                bound_pos += 1

        if arg in default_values:
            spec.default = default_values[arg]
        attrs[arg] = spec

    def get_this_args(self, runner=None):
        # Collects the value of every argument, executing the ones that are operations
        # when a runner is available
        this_args = {}
        for name, field in attrs.iteritems():
            value = getattr(self, name)
            needs_execution = (isinstance(field, BaseSpecField) and runner is not None
                               and isinstance(value, Operation))
            if needs_execution:
                value = runner.execute(value)
            this_args[name] = value
        return this_args

    def to_dict(self, include_all=False):
        serialized = super(out_type, self).to_dict(include_all=include_all)
        if method_type is None:
            serialized['type'] = get_import_path(func_to_execute)
        else:
            serialized['type'] = get_import_path(first_arg, func_to_execute.__name__)
        return serialized

    @property
    def self(self):
        if method_type is None:
            raise RuntimeError(
                'Can only be called with an operation created from a method')
        return first_arg

    def apply(self, runner):
        return func_to_execute(**self.get_this_args(runner))

    cls_attrs = attrs.copy()
    cls_attrs.update({
        'func': staticmethod(func_to_execute),
        'apply': apply,
        'get_this_args': get_this_args,
        'to_dict': to_dict,
        'self': self,
    })

    cls = Operation.type2spec_class(out_name)
    if cls is None:
        # if the class does not exist, create it
        cls = type(out_name, (out_type, ), cls_attrs)
    else:
        # otherwise update it
        for name, attr in cls_attrs.iteritems():
            setattr(cls, name, attr)

    # cache_on is None when no caching was requested
    cls.default_data_store = cache_on

    cls.__module__ = to_wrap.__module__
    return cls
class BaseDataStore(OperationRunner):
    """
    Base class for all data stores, to implement a backend you need to implement
    _get, save and iteritems methods

    The _get is the actual get procedure, the caching strategy is part of the DataStore implementation
    """
    get_cache_size = NumericField(default=0)
    verbose = PrimitiveField(default=False, serialize=False)

    def __init__(self, *args, **kwargs):
        """
        Instances the data store.

        :param get_cache_size: Size of the FIFO cache for serialization
        """
        super(BaseDataStore, self).__init__(*args, **kwargs)
        if self.get_cache_size > 0:
            self.get_cache = FifoCache(self.get_cache_size)
        else:
            # A size of 0 (or less) disables the cache
            self.get_cache = None

    @classmethod
    def get_key(cls, spec):
        """
        :param spec: Either a `Spec` or its dict representation
        :return: `spec.key` for specs, `Spec._dict2key(spec)` for dicts
        """
        if isinstance(spec, Spec):
            return spec.key
        else:
            assert isinstance(spec, dict)
            return Spec._dict2key(spec)

    def get(self, spec):
        """
        Gets an operation from this data store.

        When the cache is enabled it is consulted first, and filled on a miss.

        :raises KeyError: when the spec can not be found
        """
        if self.get_cache is None:
            return self._get_or_rehash(spec)

        try:
            return self.get_cache[spec]
        except KeyError:
            res = self._get_or_rehash(spec)
            self.get_cache.set(spec, res)
            return res

    def _get_or_rehash(self, spec):
        """
        Fetches `spec` with `_get`, giving interactive rehash a chance when it is missing
        """
        try:
            return self._get(spec)
        except KeyError:
            # TODO: I don't like putting RehashUI.ignored_specs here
            # Interactive rehash is only triggered when it has been enabled and this
            # spec has not been processed yet
            if (config.interactive_rehash and spec not in RehashUI.ignored_specs
                    and self.interactive_rehash(spec)):
                # If we did an interactive rehash, retry the get
                return self.get(spec)
            # A bare raise keeps the traceback of the original KeyError
            raise
class BaseDataStore(OperationRunner):
    """
    Base class for all data stores, to implement a backend you need to implement
    _get, save and iteritems methods

    The _get is the actual get procedure, the caching strategy is part of the DataStore implementation
    """
    get_cache_size = NumericField(default=0)
    verbose = PrimitiveField(default=False, serialize=False)

    def __init__(self, *args, **kwargs):
        """
        Instances the data store.

        :param get_cache_size: Size of the FIFO cache for serialization
        """
        super(BaseDataStore, self).__init__(*args, **kwargs)
        if self.get_cache_size > 0:
            self.get_cache = FifoCache(self.get_cache_size)
        else:
            # A size of 0 (or less) disables the cache
            self.get_cache = None

    def get(self, spec):
        """
        Gets an operation from this data store.

        When the cache is enabled it is consulted first, and filled on a miss.
        """
        if self.get_cache is None:
            return self._get(spec)

        try:
            return self.get_cache[spec]
        except KeyError:
            res = self._get(spec)
            self.get_cache.set(spec, res)
            return res

    def _get(self, spec):
        """
        Abstract method, actual implementation of the fetch from the data_store
        """
        raise NotImplementedError()

    def get_by_id(self, id):
        """
        Fetches the value given some id. The id is implementation specific
        """
        raise NotImplementedError()

    def save(self, spec, object):
        """
        Actual implementation that saves an object associated with the id or operation
        """
        raise NotImplementedError()

    def iteritems(self):
        """
        Iterates over the datastore

        :return: An iterator over (operation, object) pairs
        """
        raise NotImplementedError()

    def iterkeys(self, raw=False):
        """
        Iterates over the keys of the data store

        :param raw: Whether to return raw documents or specs
        """
        raise NotImplementedError()

    def __getitem__(self, spec):
        return self.get(spec)

    def __setitem__(self, spec, object):
        self.save(spec, object)

    def get_or_none(self, spec):
        """
        Same as `get`, but returns None instead of raising KeyError
        """
        try:
            return self.get(spec)
        except KeyError:
            return None

    def __contains__(self, spec):
        return self.get_or_none(spec) is not None

    def autosave(self, *args, **kwargs):
        """
        Builds an `AutosavedFunction` decorator that caches on this data store
        """
        kwargs['cache_on'] = self
        return AutosavedFunction(*args, **kwargs)

    def refactor(self, refactor_operation, out_data_store, permissive=False):
        """
        Applies a refactor to every document of this data store, saving the results
        on another data store

        :param refactor_operation: Operation that is bound to each raw document (`doc=...`)
            and executed to obtain the refactored document
        :param out_data_store: Data store that receives the refactored specs with their values
        :param permissive: When True, a document that fails is reported with a warning
            and skipped instead of aborting the whole refactor
        """
        # TODO: rewrite iterkeys, it's horrible!
        for doc_id, doc in self.iterkeys(raw=True):
            try:
                refactored_doc = refactor_operation.bind(doc=doc).execute()
                spec = Spec.dict2spec(refactored_doc)
                out_data_store[spec] = self.get_by_id(doc_id)
            except Exception as e:
                if not permissive:
                    # A bare raise keeps the original traceback
                    raise
                try:
                    message = ' '.join(e.args)
                except TypeError:
                    # The arguments of an exception are not necessarily strings
                    # (e.g. KeyError(some_key)); building the warning must not fail
                    message = repr(e)
                warnings.warn(message)
class FieldChange(Spec):
    """
    Records the change of value of a single field: the value it had and the value it has now.

    `Diff.build` creates these as `FieldChange(old_value, new_value)`.
    """
    # Value of the field before the change
    original_value = PrimitiveField(0)
    # Value of the field after the change
    new_value = PrimitiveField(1)
class Diff(Spec):
    """
    Difference between two dict representations of a spec: which fields were added,
    which were removed and which changed their value.
    """
    spec_type = PrimitiveField()
    added_fields = CollectionField()
    removed_fields = CollectionField()
    changed_fields = SpecCollection()
    field = PrimitiveField(
        default=None,
        help='Specifies that this diff only applies to this field')

    @classmethod
    def build(cls, old_dict, new_dict, field=None):
        """
        Computes the diff that goes from `old_dict` to `new_dict`

        :param old_dict: Dict representation before the change, must have a 'type' entry
        :param new_dict: Dict representation after the change
        :param field: Dotted path of the field both dicts were found on, None at the top level
        :return: A `Diff`, or a `ChainedDiff` (this diff first, then the diffs of the
            nested specs) when some nested spec changed
        """
        added_fields = {
            k: v
            for k, v in new_dict.iteritems() if k not in old_dict
        }
        removed_fields = [k for k in old_dict if k not in new_dict]

        changed_fields = {}
        subdiffs = []
        for k in old_dict:
            if k not in new_dict:
                continue

            old_v = old_dict[k]
            new_v = new_dict[k]

            # TODO this fails if a param starts being a spec, or a param stops being a spec
            if (isinstance(old_v, dict) and 'type' in old_v and
                    isinstance(new_v, dict) and 'type' in new_v and
                    old_v != new_v):
                # old_v and new_v refer to specs
                if field is None:
                    subdiff_field = k
                else:
                    subdiff_field = '{}.{}'.format(field, k)
                subdiffs.append(Diff.build(old_v, new_v, field=subdiff_field))
            elif old_v != new_v:
                changed_fields[k] = FieldChange(old_v, new_v)

        diff = Diff(spec_type=old_dict['type'],
                    added_fields=added_fields,
                    removed_fields=removed_fields,
                    changed_fields=changed_fields,
                    field=field)

        if subdiffs:
            subdiffs.insert(0, diff)
            return ChainedDiff(subdiffs)
        else:
            return diff

    def create_refactor(self):
        """
        :return: The storage refactor that applies this diff, projected on `field` when
            this diff refers to a field and the refactor is not empty
        """
        res = StorageRefactor()
        for field, value in self.added_fields.iteritems():
            res = res.add_field(self.spec_type, field, value)

        for field in self.removed_fields:
            res = res.remove_field(self.spec_type, field)

        for field, field_change in self.changed_fields.iteritems():
            res = res.change_field(self.spec_type, field,
                                   field_change.original_value,
                                   field_change.new_value)

        if self.field and not res.empty:
            res = res.project(self.field)
        return res

    def __repr__(self):
        def colorize(text, color):
            codes = Cmd.colorcodes[color]
            return codes[True] + text + codes[False]

        # The pieces are accumulated on a list and joined at the end. "Something has
        # already been written" is then just the truthiness of the list, with no need
        # for the `len` attribute that only the pure-Python StringIO provides
        parts = []
        if self.field is None:
            template = '{}:\n\t{}'
            join_template = '\n\t'
        else:
            # Diffs on a field get a header and one extra level of indentation
            parts.append("On field '{}'\n".format(self.field))
            template = '\t{}:\n\t\t{}'
            join_template = '\n\t\t'

        if self.added_fields:
            parts.append(
                template.format(
                    colorize('Added fields', 'green'),
                    join_template.join(
                        '{}:\t{}'.format(k, v)
                        for k, v in self.added_fields.iteritems())))

        if self.removed_fields:
            if parts:
                parts.append('\n\n')
            parts.append(
                template.format(
                    colorize('Removed fields', 'red'),
                    join_template.join(self.removed_fields)))

        if self.changed_fields:
            if parts:
                parts.append('\n\n')
            parts.append('{}:\n'.format(colorize('Changed fields', 'yellow')))
            parts.append('\t{:<15} {:<45} {:<45}'.format('field', 'from value', 'to value'))
            for field, field_change in self.changed_fields.iteritems():
                parts.append('{}{:<15} {:<45} {:<45}'.format(
                    join_template, field,
                    repr(field_change.original_value),
                    repr(field_change.new_value)))

        return ''.join(parts)