class Processor(object):
    """
    Base class for all processors.

    A processor wraps a configuration and exposes named methods that can be
    resolved together with their argument-passing style (normal vs. batch).
    """

    # Argument style used when a method is listed in neither registry below.
    DEFAULT_ARGS_TYPE = ArgumentsTypes.NORMAL
    # Subclasses register method names here to override the argument style.
    ARGS_NORMAL_METHODS = []
    ARGS_BATCH_METHODS = []

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace='global'
    )

    def __init__(self, config):
        # Validate with an explicit raise instead of assert: asserts are
        # stripped when Python runs with -O, which would silently skip this check.
        if not isinstance(config, dict):
            raise TypeError("Processor expects to always get a configuration.")
        self.config = config

    def get_processor_method(self, method_name):
        """
        Resolve method_name on this processor.

        Returns a (bound method, arguments type) tuple, where the arguments
        type is looked up in ARGS_NORMAL_METHODS / ARGS_BATCH_METHODS and
        falls back to DEFAULT_ARGS_TYPE.
        """
        if method_name in self.ARGS_NORMAL_METHODS:
            args_type = ArgumentsTypes.NORMAL
        elif method_name in self.ARGS_BATCH_METHODS:
            args_type = ArgumentsTypes.BATCH
        else:
            args_type = self.DEFAULT_ARGS_TYPE
        return getattr(self, method_name), args_type

    @staticmethod
    def get_processor_class(processor_name):
        """Look up a processor class by name through the Django "core" app registry."""
        core_config = apps.get_app_config("core")
        return core_config.get_processor_class(processor_name)
class ShellResourceProcessor(ResourceProcessor):
    """Processor that executes shell resources through background tasks."""

    ARGS_BATCH_METHODS = ['run_mass']

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[
            "_resource",
        ],
        namespace="shell_resource"
    )

    #######################################################
    # TASKS
    #######################################################
    # Wrappers that act as an interface
    # to background retrieval of resources

    @property
    def run(self):
        """Task signature for running a single shell resource."""
        serialized_config = self.config.to_dict(private=True, protected=True)
        return run.s(config=serialized_config)

    @property
    def run_mass(self):
        """Task signature for running a series of shell resources."""
        serialized_config = self.config.to_dict(private=True, protected=True)
        return run_serie.s(config=serialized_config)
class ManifestProcessor(ResourceProcessor):
    """Processor that manifests community content through Manifestation resources."""

    ARGS_BATCH_METHODS = ['manifest_mass']

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="manifest_processor"
    )

    def __init__(self, config):
        # Pin the resource this processor works with before handing the
        # configuration to the base class. NB: this mutates the caller's dict.
        config.update({"_resource": "Manifestation"})
        super(ManifestProcessor, self).__init__(config)
        assert "_community" in config or "community" in config, \
            "ManifestProcessor expects a community that it should manifest in the configuration."

    def manifest_from_individuals(self, individuals):
        """Generator: manifest every individual and yield it back unchanged."""
        serialize = Individual.output_from_content
        for ind in individuals:
            call_args = serialize(ind, self.config.args)
            call_kwargs = serialize(ind, self.config.kwargs)
            manifest(*call_args, config=self.config, **call_kwargs)
            yield ind

    @property
    def manifest_mass(self):
        """Task signature for manifesting many individuals in the background."""
        serialized_config = self.config.to_dict(private=True, protected=True)
        return manifest_serie.s(config=serialized_config)
class HttpResourceProcessor(ResourceProcessor):
    """Processor that fetches or submits HttpResources through background tasks."""

    ARGS_BATCH_METHODS = ['fetch_mass', 'submit_mass']

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_resource", "_continuation_limit", "_batch_size"],
        namespace="http_resource"
    )

    #######################################################
    # Getters
    #######################################################

    @classmethod
    def get_session(cls, config):
        """Return a fresh requests session; subclasses may override."""
        return requests.Session()

    #######################################################
    # TASKS
    #######################################################
    # Wrappers that act as an interface
    # to background retrieval of resources

    def _task_signature(self, task, http_method):
        # Shared builder for the four task-signature properties below.
        return task.s(
            method=http_method,
            config=self.config.to_dict(private=True, protected=True),
            session=self.__class__.__name__
        )

    @property
    def fetch(self):
        """Signature for a single GET retrieval."""
        return self._task_signature(send, "get")

    @property
    def fetch_mass(self):
        """Signature for mass GET retrieval."""
        return self._task_signature(send_mass, "get")

    @property
    def submit(self):
        """Signature for a single POST submission."""
        return self._task_signature(send, "post")

    @property
    def submit_mass(self):
        """Signature for mass POST submission."""
        return self._task_signature(send_mass, "post")
class ManifestProcessor(Processor):
    """Processor that creates Manifestation objects for community content."""

    ARGS_BATCH_METHODS = ['manifest_mass']

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="manifest_processor"
    )

    def __init__(self, config):
        super(ManifestProcessor, self).__init__(config)
        assert "_community" in config or "community" in config, \
            "ManifestProcessor expects a community that it should manifest in the configuration."
        self._community = None

    def manifest_from_individuals(self, individuals):
        """Generator: manifest every individual and yield it back unchanged."""
        for ind in individuals:
            call_args = Individual.output_from_content(ind, self.config.args)
            call_kwargs = Individual.output_from_content(ind, self.config.kwargs)
            manifest(*call_args, config=self.config, **call_kwargs)
            yield ind

    @staticmethod
    def async_results(result_id):
        """
        Return the payload of a finished background task.

        Raises DSProcessUnfinished while the task is pending and
        DSProcessError when the task did not end in SUCCESS.
        """
        async_result = AsyncResult(result_id)
        if not async_result.ready():
            raise DSProcessUnfinished("Result with id {} is not ready.".format(result_id))
        if async_result.status != TaskStates.SUCCESS:
            raise DSProcessError("An error occurred during background processing.")
        return async_result.result

    def results(self, result):
        """Split a (success_ids, error_ids) pair into two Manifestation querysets."""
        success_ids, error_ids = result
        successes = Manifestation.objects.filter(id__in=success_ids)
        errors = Manifestation.objects.filter(id__in=error_ids)
        return successes, errors

    @property
    def manifest_mass(self):
        """Task signature for manifesting many individuals in the background."""
        serialized_config = self.config.to_dict(private=True, protected=True)
        return manifest_serie.s(config=serialized_config)
def setUpClass(cls):
    """Create the shared ConfigurationProperty fixture used by all tests in this case."""
    # NOTE(review): presumably decorated with @classmethod outside this view,
    # as required for setUpClass — confirm in the enclosing class.
    super(TestConfigurationProperty, cls).setUpClass()
    # "_test3" is declared private; how that affects serialization is defined
    # by ConfigurationProperty itself (not visible here).
    cls.property = ConfigurationProperty(
        "storage",
        namespace="name",
        private=["_test3"],
        defaults=MOCK_CONFIGURATION
    )
class ConfigurationPropertyHolder(object):
    """Plain holder class exposing a ConfigurationProperty descriptor for tests."""

    # NOTE: the attribute name deliberately shadows the builtin ``property``;
    # renaming it would change the interface under test.
    property = ConfigurationProperty(
        "storage",
        namespace="name",
        private=["_test3"],
        defaults=MOCK_CONFIGURATION
    )
class ExtractProcessor(Processor):
    """
    Processor that extracts data from resources according to an "objective".

    An objective is a dict where the "@" key selects the nodes to iterate,
    keys starting with "#" define context values shared by all results and
    all other keys define per-node output values.
    """

    config = ConfigurationProperty(storage_attribute="_config", defaults=DEFAULT_CONFIGURATION, private=["_objective"], namespace="extract_processor")

    def __init__(self, config):
        super(ExtractProcessor, self).__init__(config)
        self._at = None          # selector for the nodes to iterate ("@" key)
        self._context = {}       # "#"-prefixed objectives, shared across results
        self._objective = {}     # per-node objectives
        if "_objective" in config or "objective" in config:
            self.load_objective(self.config.objective)

    def load_objective(self, objective):
        """Split an objective dict into the "@" selector, "#" context and node objectives."""
        assert isinstance(objective, dict), "An objective should be a dict."
        for key, value in six.iteritems(objective):
            if key == "@":
                self._at = value
            elif key.startswith("#"):
                # Strip the "#" marker; the remainder is the context name.
                self._context.update({key[1:]: value})
            else:
                self._objective.update({key: value})
        assert self._at, \
            "ExtractProcessor did not load elements to start with from its objective {}. " \
            "Make sure that '@' is specified".format(objective)
        assert self._objective, "No objectives loaded from objective {}".format(
            objective)

    def pass_resource_through(self, resource):
        """Return the resource's raw data without extracting anything."""
        mime_type, data = resource.content
        return data

    def extract_from_resource(self, resource):
        """Extract from a resource; resource.content must be a (content_type, data) pair."""
        return self.extract(*resource.content)

    def extract(self, content_type, data):
        """
        Dispatch to a handler method named after the content type
        (e.g. "application/json" -> application_json). Raises TypeError when
        no handler exists for the content type.
        """
        assert self.config.objective, \
            "ExtractProcessor.extract expects an objective to extract in the configuration."
        content_type_method = content_type.replace("/", "_")
        method = getattr(self, content_type_method, None)
        if method is not None:
            return method(data)
        else:
            raise TypeError(
                "Extract processor does not support content_type {}".format(
                    content_type))

    def application_json(self, data):
        """Generator yielding one result dict per node reached through the "@" selector."""
        context = {}
        for name, objective in six.iteritems(self._context):
            context[name] = reach(objective, data)
        nodes = reach(self._at, data)
        if isinstance(nodes, dict):
            # Iterate dict values so dict-shaped node collections behave like lists.
            nodes = six.itervalues(nodes)
        if nodes is None:
            raise DSNoContent("Found no nodes at {}".format(self._at))
        for node in nodes:
            # Copy so each yielded result gets its own dict seeded with the context.
            result = copy(context)
            for name, objective in six.iteritems(self._objective):
                result[name] = reach(objective, node)
            yield result

    def text_html(self, soup):  # soup used in eval!
        """
        Generator extracting from parsed HTML. Objectives are Python expressions
        evaluated with "soup" (and "el" per element) in scope.

        NOTE(review): SECURITY — objectives are passed to eval(); they must come
        from trusted configuration only, never from user-controlled input.
        """
        context = {}
        for name, objective in six.iteritems(self._context):
            context[name] = eval(objective) if objective else objective
        # The "@" objective may evaluate to a single element or a list of them.
        at = elements = eval(self._at)
        if not isinstance(at, list):
            elements = [at]
        for el in elements:  # el used in eval!
            result = copy(context)
            for name, objective in six.iteritems(self._objective):
                result[name] = eval(objective) if objective else objective
            yield result
class MockProcessor(Processor):
    """Minimal processor used in tests; it only carries a mock configuration."""

    config = ConfigurationProperty(
        namespace="mock_processor",
        storage_attribute="_config",
        defaults=MOCK_CONFIGURATION,
        private=[]
    )
class LegacyRankProcessor(Processor):
    """
    Deprecated rank processor that scores and sorts individuals in batches.

    Individuals are processed in batches of config.batch_size; each batch is
    sorted and truncated to config.result_size, and the per-batch results are
    merged into one globally sorted stream at the end.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    @staticmethod
    def _flush_batch(results, batch, sort_key, result_size):
        # Sort the current batch (highest first), keep the top result_size
        # entries and store them for the final merge. Shared by score() and
        # hooks(), which previously duplicated this logic.
        sorted_batch = sorted(batch, key=sort_key, reverse=True)[:result_size]
        results.append(sorted_batch)

    def score(self, individuals):
        """
        Deprecated: sort individuals by the config.score_key value.

        Returns an iterator over at most config.result_size individuals,
        highest score first.
        """
        warnings.warn("The RankProcessor.score method is deprecated. Use by_feature instead.", DeprecationWarning)

        # def instead of lambda assignment (PEP 8 E731).
        def sort_key(el):
            return el.get(self.config.score_key, 0)

        results = []
        batch = []
        for idx, individual in enumerate(individuals):
            # Flush a full batch before appending; idx 0 is skipped because the
            # batch is still empty.
            if not idx % self.config.batch_size and len(batch):
                self._flush_batch(results, batch, sort_key, self.config.result_size)
                batch = []
            batch.append(individual)
        self._flush_batch(results, batch, sort_key, self.config.result_size)
        return islice(merge_iter(*results, key=sort_key, reversed=True), self.config.result_size)

    def get_hook_arguments(self, individual):
        # deepcopy so hook implementations can mutate their argument freely.
        return (deepcopy(individual),)

    def hooks(self, individuals):
        """
        Rank individuals through "$"-prefixed hook methods configured with weights.

        Each hook contributes value * weight; the per-hook breakdown and the
        aggregated rank are stored under individual["_rank"]. Returns an
        iterator over at most config.result_size individuals, best first.
        """
        config_dict = self.config.to_dict()
        hooks = [
            getattr(self, hook[1:])
            for hook, weight in six.iteritems(config_dict)  # config gets whitelisted by Community
            if isinstance(hook, str) and hook.startswith("$") and callable(getattr(self, hook[1:], None)) and weight
        ]

        def sort_key(el):
            return el["_rank"].get("rank", 0)

        results = []
        batch = []
        for idx, individual in enumerate(individuals):
            # Get ranks from modules
            rank_info = {hook.__name__: {"rank": 0.0} for hook in hooks}
            for hook in hooks:
                hook_name = hook.__name__
                try:
                    hook_result = hook(*self.get_hook_arguments(individual))
                    module_value = float(hook_result)
                    module_weight = float(config_dict["$" + hook_name])
                except (ValueError, TypeError):
                    # Non-numeric hook results (including None) raise here, so
                    # no separate None check is needed afterwards. (The old
                    # "if module_value is None" branch was unreachable: float()
                    # never returns None.)
                    continue
                rank_info[hook_name] = {
                    "rank": module_value * module_weight,
                    "value": module_value,
                    "weight": module_weight
                }
            # Aggregate all ranks to a single rank
            hook_rankings = [ranking for ranking in six.itervalues(rank_info) if ranking["rank"]]
            if hook_rankings:
                rank_info["rank"] = sum(ranking["rank"] for ranking in hook_rankings)
            # Set info on individual and write batch to results when appropriate
            individual['_rank'] = rank_info
            if not idx % self.config.batch_size and len(batch):
                self._flush_batch(results, batch, sort_key, self.config.result_size)
                batch = []
            # Append ranked individual to batch
            batch.append(individual)
        self._flush_batch(results, batch, sort_key, self.config.result_size)
        return islice(merge_iter(*results, key=sort_key, reversed=True), self.config.result_size)
class RankProcessor(QuerySetProcessor):
    """
    Processor that ranks queryset content through numeric/text feature frames.

    Subclasses implement feature methods; those listed in contextual_features
    are computed on demand with the current configuration as context.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    # Feature names that must be (re)computed with runtime context instead of
    # being precomputed into the feature frame.
    contextual_features = []

    def __init__(self, config):
        super().__init__(config)
        # Numeric features are only available when both an identifier key and
        # a frame file path are configured.
        if "identifier_key" in self.config and "feature_frame_path" in self.config:
            self.feature_frame = NumericFeaturesFrame(
                identifier=lambda ind: ind[self.config.identifier_key],
                features=self.get_features(),
                file_path=self.config.feature_frame_path
            )
        else:
            self.feature_frame = None
        # Text features additionally require a language setting.
        if "identifier_key" in self.config and "text_frame_path" in self.config and "language" in self.config:
            self.text_frame = TextFeaturesFrame(
                get_identifier=lambda ind: ind[self.config.identifier_key],
                get_text=self.get_text,
                language=self.config.language,
                file_path=self.config.text_frame_path
            )
        else:
            self.text_frame = None

    @staticmethod
    def get_text(document):
        raise NotImplementedError("The get_text method should be implemented in its context")

    @classmethod
    def get_features(cls):
        """Return the callables a subclass adds on top of RankProcessor, minus contextual features."""
        mother = set(dir(RankProcessor))
        own = set(dir(cls))
        return [
            getattr(cls, attr) for attr in (own - mother)
            if callable(getattr(cls, attr)) and attr not in cls.contextual_features
        ]

    def get_ranking_results(self, ranking, query_set, series):
        """
        Generator yielding content dicts ordered by a ranking series, with a
        "_rank" breakdown per entry.

        NOTE(review): ranking and series entries appear to be pandas Series
        indexed by entry identity (".at" lookups) — confirm.
        """
        # TODO: assert identity? how?
        max_size = self.config.result_size
        if query_set.count() >= len(ranking):
            # More rows than ranked entries: restrict to the top ranked identities.
            results = list(query_set.filter(identity__in=ranking.index[:max_size]))
        else:
            # NOTE(review): entries missing from the ranking index would raise
            # in the sort below — presumably the queryset is a subset of the
            # ranking in this branch; confirm against callers.
            results = list(query_set)
        results.sort(key=lambda entry: ranking.at[entry.identity], reverse=True)
        results = results[:max_size]
        for individual in results:
            ix = individual[self.config.identifier_key]
            content = individual.content
            content["_rank"] = {
                "rank": ranking.at[ix]
            }
            for serie in series:
                value = serie.at[ix]
                content["_rank"][serie.name] = {
                    "rank": value,  # TODO: rank value should be multiplied by weight
                    "value": value,
                    "weight": 1.0
                }
            yield content

    def default_ranking(self, query_set):
        raise NotImplementedError("The default_ranking method should be implemented in its context")

    def by_feature(self, query_set):
        """Rank the queryset by the single feature named in config.ranking_feature."""
        assert "ranking_feature" in self.config, "RankProcessor.by_feature needs a ranking_feature from config"
        assert self.feature_frame, \
            "RankProcessor needs a identifier_key and feature_frame_path configuration " \
            "to perform RankProcessor.by_feature"
        ranking_feature = self.config.ranking_feature
        assert ranking_feature in self.feature_frame.features or ranking_feature in self.contextual_features, \
            "The non-contextual feature '{}' is not loaded in the feature frame".format(ranking_feature)
        if ranking_feature not in self.contextual_features:
            # Precomputed feature: read straight from the frame.
            ranked_feature = self.feature_frame.data[ranking_feature]
        else:
            # Contextual feature: compute now with the current configuration.
            ranked_feature = self.feature_frame.get_feature_series(
                ranking_feature, getattr(self, ranking_feature),
                content_callable=query_set.iterator, context=self.config.to_dict()
            )
        # Missing values rank as 0; highest feature value first.
        ranked_feature = ranked_feature.fillna(0).sort_values(ascending=False)
        return self.get_ranking_results(ranked_feature, query_set, [ranked_feature])

    def by_params(self, individuals):
        # Placeholder: intentionally does nothing yet.
        pass
class HttpResourceProcessor(Processor):
    # TODO: make sphinx friendly and doc all methods
    """
    A collection of Celery tasks that share their need for a specific configuration.

    Each task should return a single list of ids to be further handled by classes like Growth.

    The configuration must include
    - a HttpResource class name to be loaded with Django
    - a guideline to how deep a single resource should collect data
    """

    ARGS_BATCH_METHODS = ['fetch_mass', 'submit_mass']

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_resource", "_continuation_limit", "_batch_size"],
        namespace="http_resource"
    )

    def __init__(self, config):
        super(HttpResourceProcessor, self).__init__(config)
        assert "_resource" in config or "resource" in config, \
            "HttpResourceProcessor expects a resource that it should fetch in the configuration."
        self._resource = None  # resolved lazily through the ``resource`` property

    #######################################################
    # Interface
    #######################################################

    @staticmethod
    def async_results(result_id):
        """
        Return the payload of a finished background task.

        Raises DSProcessUnfinished while pending and DSProcessError when the
        task did not end in SUCCESS.
        """
        async_result = AsyncResult(result_id)
        if not async_result.ready():
            raise DSProcessUnfinished("Result with id {} is not ready.".format(result_id))
        if async_result.status != TaskStates.SUCCESS:
            raise DSProcessError("An error occurred during background processing.")
        return async_result.result

    def results(self, result):
        """Split a (success_ids, error_ids) pair into two resource querysets."""
        success_ids, error_ids = result
        successes = self.resource.objects.filter(id__in=success_ids)
        errors = self.resource.objects.filter(id__in=error_ids)
        return successes, errors

    #######################################################
    # Getters
    #######################################################

    @property
    def resource(self):
        """The resource model class, resolved once from configuration and cached."""
        if not self._resource:
            self._resource = get_any_model(self.config.resource)
        return self._resource

    @classmethod
    def get_session(cls, config):
        """Return a fresh requests session; subclasses may override."""
        return requests.Session()

    #######################################################
    # TASKS
    #######################################################
    # Wrappers that act as an interface
    # to background retrieval of resources

    def _task_signature(self, task, http_method):
        # Shared builder for the four task-signature properties below.
        return task.s(
            method=http_method,
            config=self.config.to_dict(private=True, protected=True),
            session=self.__class__.__name__
        )

    @property
    def fetch(self):
        """Signature for a single GET retrieval."""
        return self._task_signature(send, "get")

    @property
    def fetch_mass(self):
        """Signature for mass GET retrieval."""
        return self._task_signature(send_mass, "get")

    @property
    def submit(self):
        """Signature for a single POST submission."""
        return self._task_signature(send, "post")

    @property
    def submit_mass(self):
        """Signature for mass POST submission."""
        return self._task_signature(send_mass, "post")