コード例 #1
0
ファイル: base.py プロジェクト: peymanity/datascope
class Processor(object):
    """Base class for processors that are driven by a configuration dict.

    Subclasses list method names in ``ARGS_NORMAL_METHODS`` or
    ``ARGS_BATCH_METHODS`` to declare which ``ArgumentsTypes`` value
    ``get_processor_method`` reports for them; any other method gets
    ``DEFAULT_ARGS_TYPE``.
    """

    DEFAULT_ARGS_TYPE = ArgumentsTypes.NORMAL
    ARGS_NORMAL_METHODS = []
    ARGS_BATCH_METHODS = []

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="global",
    )

    def __init__(self, config):
        """Store ``config`` through the ``config`` property.

        Raises:
            AssertionError: when ``config`` is not a dict.
        """
        is_configuration = isinstance(config, dict)
        assert is_configuration, "Processor expects to always get a configuration."
        self.config = config

    def get_processor_method(self, method_name):
        """Return ``(bound attribute, arguments type)`` for ``method_name``.

        The explicit NORMAL list is consulted before the BATCH list; names in
        neither list fall back to ``DEFAULT_ARGS_TYPE``. A missing attribute
        raises AttributeError through ``getattr``.
        """
        if method_name in self.ARGS_NORMAL_METHODS:
            return getattr(self, method_name), ArgumentsTypes.NORMAL
        if method_name in self.ARGS_BATCH_METHODS:
            return getattr(self, method_name), ArgumentsTypes.BATCH
        return getattr(self, method_name), self.DEFAULT_ARGS_TYPE

    @staticmethod
    def get_processor_class(processor_name):
        """Delegate the lookup of ``processor_name`` to the "core" app config."""
        return apps.get_app_config("core").get_processor_class(processor_name)
コード例 #2
0
ファイル: shell.py プロジェクト: tstikvoort/datagrowth
class ShellResourceProcessor(ResourceProcessor):
    """Exposes the ``run`` and ``run_serie`` tasks bound to this configuration."""

    ARGS_BATCH_METHODS = ["run_mass"]

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_resource"],
        namespace="shell_resource",
    )

    # ------------------------------------------------------------------
    # Tasks: wrappers that act as an interface to background retrieval
    # of resources.
    # ------------------------------------------------------------------

    @property
    def run(self):
        """Return ``run.s`` carrying the configuration as a plain dict.

        The dict is produced with ``private=True`` and ``protected=True``.
        """
        # Inside the method body ``run`` resolves to the module-level task,
        # not to this property.
        serialized = self.config.to_dict(private=True, protected=True)
        return run.s(config=serialized)

    @property
    def run_mass(self):
        """Return ``run_serie.s`` carrying the configuration as a plain dict."""
        serialized = self.config.to_dict(private=True, protected=True)
        return run_serie.s(config=serialized)
コード例 #3
0
ファイル: manifest.py プロジェクト: tstikvoort/datagrowth
class ManifestProcessor(ResourceProcessor):
    """Runs ``manifest`` for individuals, directly or as a background task."""

    ARGS_BATCH_METHODS = ["manifest_mass"]

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="manifest_processor",
    )

    def __init__(self, config):
        """Force the resource to "Manifestation" and require a community.

        Raises:
            AssertionError: when neither "_community" nor "community" is a
                key of ``config``.
        """
        # NOTE: this updates the object passed in by the caller, in place.
        config.update({"_resource": "Manifestation"})
        super(ManifestProcessor, self).__init__(config)
        has_community = "_community" in config or "community" in config
        assert has_community, \
            "ManifestProcessor expects a community that it should manifest in the configuration."

    def manifest_from_individuals(self, individuals):
        """Call ``manifest`` once per individual and yield that individual.

        Positional and keyword arguments for every call are derived from the
        individual through ``Individual.output_from_content`` using
        ``config.args`` and ``config.kwargs``.
        """
        for individual in individuals:
            positional = Individual.output_from_content(individual, self.config.args)
            named = Individual.output_from_content(individual, self.config.kwargs)
            manifest(*positional, config=self.config, **named)
            yield individual

    @property
    def manifest_mass(self):
        """Return ``manifest_serie.s`` carrying the configuration as a plain dict."""
        serialized = self.config.to_dict(private=True, protected=True)
        return manifest_serie.s(config=serialized)
コード例 #4
0
ファイル: http.py プロジェクト: tstikvoort/datagrowth
class HttpResourceProcessor(ResourceProcessor):
    """Exposes the ``send`` and ``send_mass`` tasks for GET and POST requests."""

    ARGS_BATCH_METHODS = ["fetch_mass", "submit_mass"]

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_resource", "_continuation_limit", "_batch_size"],
        namespace="http_resource",
    )

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    @classmethod
    def get_session(cls, config):
        """Return a new ``requests.Session``; ``config`` is not used here."""
        return requests.Session()

    # ------------------------------------------------------------------
    # Tasks: wrappers that act as an interface to background retrieval
    # of resources. Every task gets the configuration as a plain dict and
    # this class' name as ``session``.
    # ------------------------------------------------------------------

    @property
    def fetch(self):
        """Return ``send.s`` prepared for the "get" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send.s(method="get", config=serialized, session=session_name)

    @property
    def fetch_mass(self):
        """Return ``send_mass.s`` prepared for the "get" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send_mass.s(method="get", config=serialized, session=session_name)

    @property
    def submit(self):
        """Return ``send.s`` prepared for the "post" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send.s(method="post", config=serialized, session=session_name)

    @property
    def submit_mass(self):
        """Return ``send_mass.s`` prepared for the "post" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send_mass.s(method="post", config=serialized, session=session_name)
コード例 #5
0
class ManifestProcessor(Processor):
    """Runs ``manifest`` for individuals and resolves background results."""

    ARGS_BATCH_METHODS = ["manifest_mass"]

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="manifest_processor",
    )

    def __init__(self, config):
        """Require a community in ``config``.

        Raises:
            AssertionError: when neither "_community" nor "community" is a
                key of ``config``.
        """
        super(ManifestProcessor, self).__init__(config)
        has_community = "_community" in config or "community" in config
        assert has_community, \
            "ManifestProcessor expects a community that it should manifest in the configuration."
        self._community = None

    def manifest_from_individuals(self, individuals):
        """Call ``manifest`` once per individual and yield that individual.

        Positional and keyword arguments for every call are derived from the
        individual through ``Individual.output_from_content`` using
        ``config.args`` and ``config.kwargs``.
        """
        for individual in individuals:
            positional = Individual.output_from_content(individual, self.config.args)
            named = Individual.output_from_content(individual, self.config.kwargs)
            manifest(*positional, config=self.config, **named)
            yield individual

    @staticmethod
    def async_results(result_id):
        """Return the result of the background task ``result_id``.

        Raises:
            DSProcessUnfinished: when the task is not ready yet.
            DSProcessError: when the task is ready but its status is not
                ``TaskStates.SUCCESS``.
        """
        task = AsyncResult(result_id)
        if task.ready():
            if task.status == TaskStates.SUCCESS:
                return task.result
            raise DSProcessError("An error occurred during background processing.")
        raise DSProcessUnfinished("Result with id {} is not ready.".format(result_id))

    def results(self, result):
        """Turn a pair of id collections into two ``Manifestation`` querysets.

        ``result`` must unpack into exactly two values: the first is used for
        the first returned queryset, the second for the other one.
        """
        success_ids, error_ids = result
        return (
            Manifestation.objects.filter(id__in=success_ids),
            Manifestation.objects.filter(id__in=error_ids),
        )

    @property
    def manifest_mass(self):
        """Return ``manifest_serie.s`` carrying the configuration as a plain dict."""
        serialized = self.config.to_dict(private=True, protected=True)
        return manifest_serie.s(config=serialized)
コード例 #6
0
ファイル: configuration.py プロジェクト: peymanity/datascope
 def setUpClass(cls):
     """Run the parent class setup and attach a ConfigurationProperty to ``cls``."""
     super(TestConfigurationProperty, cls).setUpClass()
     # Shared by all tests of the class through ``cls.property``.
     cls.property = ConfigurationProperty(
         "storage",
         defaults=MOCK_CONFIGURATION,
         private=["_test3"],
         namespace="name",
     )
コード例 #7
0
ファイル: configuration.py プロジェクト: peymanity/datascope
class ConfigurationPropertyHolder(object):
    """Minimal class that carries a ConfigurationProperty as ``property``."""

    # "storage" is passed positionally; presumably it is the storage
    # attribute name -- confirm against ConfigurationProperty's signature.
    property = ConfigurationProperty(
        "storage",
        defaults=MOCK_CONFIGURATION,
        private=["_test3"],
        namespace="name",
    )
コード例 #8
0
ファイル: extraction.py プロジェクト: peymanity/datascope
class ExtractProcessor(Processor):
    """Extracts dicts of values from JSON data or HTML soup using an objective.

    An objective is a dict with three kinds of keys:

    - ``"@"``: where the elements to extract from are found,
    - keys starting with ``"#"``: context values, copied into every result
      under the key without the ``"#"``,
    - any other key: a value extracted per element.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_objective"],
        namespace="extract_processor",
    )

    def __init__(self, config):
        """Initialise empty extraction state and load a configured objective.

        The objective is only loaded when "_objective" or "objective" is a
        key of ``config``.
        """
        super(ExtractProcessor, self).__init__(config)
        self._at = None
        self._context = {}
        self._objective = {}
        has_objective = "_objective" in config or "objective" in config
        if not has_objective:
            return
        self.load_objective(self.config.objective)

    def load_objective(self, objective):
        """Split ``objective`` into the start path, context and objectives.

        Raises:
            AssertionError: when ``objective`` is not a dict, when no truthy
                "@" value ends up loaded, or when no objectives are loaded.
        """
        assert isinstance(objective, dict), "An objective should be a dict."
        for key, value in six.iteritems(objective):
            if key == "@":
                self._at = value
                continue
            if key.startswith("#"):
                # Context keys are stored without their "#" marker.
                self._context[key[1:]] = value
                continue
            self._objective[key] = value
        assert self._at, \
            "ExtractProcessor did not load elements to start with from its objective {}. " \
            "Make sure that '@' is specified".format(objective)
        assert self._objective, "No objectives loaded from objective {}".format(
            objective)

    def pass_resource_through(self, resource):
        """Return the second element of ``resource.content`` unchanged."""
        _, payload = resource.content
        return payload

    def extract_from_resource(self, resource):
        """Call ``extract`` with the unpacked ``resource.content``."""
        return self.extract(*resource.content)

    def extract(self, content_type, data):
        """Dispatch ``data`` to the method named after ``content_type``.

        The method name is ``content_type`` with "/" replaced by "_", for
        instance "application/json" becomes ``application_json``.

        Raises:
            AssertionError: when the configuration has no truthy objective.
            TypeError: when no attribute exists for ``content_type``.
        """
        assert self.config.objective, \
            "ExtractProcessor.extract expects an objective to extract in the configuration."
        handler = getattr(self, content_type.replace("/", "_"), None)
        if handler is None:
            raise TypeError(
                "Extract processor does not support content_type {}".format(
                    content_type))
        return handler(data)

    def application_json(self, data):
        """Yield one result dict per node found at the "@" path in ``data``.

        Context values are resolved once against ``data`` with ``reach``;
        objective values are resolved per node. As this is a generator, the
        DSNoContent error is raised when iteration starts.

        Raises:
            DSNoContent: when ``reach`` finds nothing (None) at the "@" path.
        """
        shared = {
            name: reach(path, data)
            for name, path in six.iteritems(self._context)
        }

        found = reach(self._at, data)
        if found is None:
            raise DSNoContent("Found no nodes at {}".format(self._at))
        if isinstance(found, dict):
            # For a dict the values are the nodes.
            found = six.itervalues(found)

        for node in found:
            extracted = copy(shared)
            for name, path in six.iteritems(self._objective):
                extracted[name] = reach(path, node)
            yield extracted

    def text_html(self, soup):  # soup used in eval!
        """Yield one result dict per element selected from ``soup``.

        Context values, the "@" value and objective values are Python
        expressions that get evaluated with ``eval``. The expressions can
        see the local names of this method, notably ``soup`` and ``el``, so
        these names must not be renamed.
        """
        # NOTE(review): eval() executes expressions that come from the
        # objective in the configuration. Make sure objectives never
        # originate from untrusted input.

        context = {}
        for name, objective in six.iteritems(self._context):
            context[name] = eval(objective) if objective else objective

        at = elements = eval(self._at)
        if not isinstance(at, list):
            # A single non-list outcome is treated as one element.
            elements = [at]

        for el in elements:  # el used in eval!
            result = copy(context)
            for name, objective in six.iteritems(self._objective):
                result[name] = eval(objective) if objective else objective
            yield result
コード例 #9
0
ファイル: processor.py プロジェクト: tstikvoort/datagrowth
class MockProcessor(Processor):
    """Processor whose configuration defaults come from MOCK_CONFIGURATION."""

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=MOCK_CONFIGURATION,
        private=[],
        namespace="mock_processor",
    )
コード例 #10
0
ファイル: rank.py プロジェクト: tstikvoort/datagrowth
class LegacyRankProcessor(Processor):
    """Ranks individuals by a score key or by weighted hook methods.

    Both public ranking methods sort their input in batches of
    ``config.batch_size``, keep the best ``config.result_size`` entries of
    every batch and merge these batch winners into a single ranking of at
    most ``config.result_size`` entries.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    def _top_of_batch(self, batch, sort_key):
        """Return the ``config.result_size`` highest entries of ``batch``."""
        return sorted(batch, key=sort_key, reverse=True)[:self.config.result_size]

    def _merge_top_batches(self, individuals, sort_key):
        """Rank ``individuals`` batch by batch and merge the batch winners.

        :param individuals: iterable of individuals; consumed completely
        :param sort_key: callable that returns the ranking value of an individual
        :return: iterator over at most ``config.result_size`` individuals,
            taken from ``merge_iter`` over the per-batch winners
        """
        results = []
        batch = []
        for idx, individual in enumerate(individuals):
            # A filled batch is flushed right before the first individual of
            # the next batch gets added.
            if not idx % self.config.batch_size and batch:
                results.append(self._top_of_batch(batch, sort_key))
                batch = []
            batch.append(individual)
        # The trailing batch is always flushed, even when it is empty.
        results.append(self._top_of_batch(batch, sort_key))
        return islice(merge_iter(*results, key=sort_key, reversed=True), self.config.result_size)

    def _rank_individual(self, individual, hooks, config_dict):
        """Build the "_rank" info of one individual from all active hooks.

        :param individual: the individual to rank
        :param hooks: bound hook methods to call
        :param config_dict: configuration dict holding a "$<hook name>"
            weight for every hook
        :return: dict with an entry per hook name plus, when at least one
            hook produced a non-zero rank, the summed rank under "rank"
        """
        rank_info = {hook.__name__: {"rank": 0.0} for hook in hooks}
        for hook in hooks:
            hook_name = hook.__name__
            try:
                hook_result = hook(*self.get_hook_arguments(individual))
                module_value = float(hook_result)
                module_weight = float(config_dict["$" + hook_name])
            except (ValueError, TypeError):
                # Best effort: a hook that fails this way, or returns something
                # that is not a number (float(None) raises TypeError), keeps
                # its default rank of 0.0.
                continue
            rank_info[hook_name] = {
                "rank": module_value * module_weight,
                "value": module_value,
                "weight": module_weight
            }
        # Aggregate all non-zero hook ranks to a single rank
        hook_ranks = [ranking["rank"] for ranking in six.itervalues(rank_info) if ranking["rank"]]
        if hook_ranks:
            rank_info["rank"] = sum(hook_ranks)
        return rank_info

    def score(self, individuals):
        """Rank ``individuals`` by the value under ``config.score_key``.

        Deprecated: emits a DeprecationWarning on every call. Individuals
        without the score key count as 0.

        :param individuals: iterable of dict-like individuals
        :return: iterator over at most ``config.result_size`` individuals
        """
        warnings.warn("The RankProcessor.score method is deprecated. Use by_feature instead.", DeprecationWarning)

        def sort_key(el):
            return el.get(self.config.score_key, 0)

        return self._merge_top_batches(individuals, sort_key)

    def get_hook_arguments(self, individual):
        """Return the positional arguments for a hook: a deep copy of ``individual``."""
        return (deepcopy(individual),)

    def hooks(self, individuals):
        """Rank ``individuals`` by the weighted sum of the configured hooks.

        A hook is active when the configuration holds a truthy value under
        "$<name>" and ``<name>`` is a callable attribute of this processor.
        Every individual gets its rank details written to its "_rank" key.

        :param individuals: iterable of dict-like individuals; mutated in place
        :return: iterator over at most ``config.result_size`` individuals
        """
        config_dict = self.config.to_dict()
        active_hooks = [
            getattr(self, hook[1:])
            for hook, weight in six.iteritems(config_dict)  # config gets whitelisted by Community
            if isinstance(hook, str) and hook.startswith("$") and callable(getattr(self, hook[1:], None)) and weight
        ]

        def sort_key(el):
            return el["_rank"].get("rank", 0)

        def ranked():
            # Ranks are computed lazily, just before an individual is batched.
            for individual in individuals:
                individual["_rank"] = self._rank_individual(individual, active_hooks, config_dict)
                yield individual

        return self._merge_top_batches(ranked(), sort_key)
コード例 #11
0
ファイル: rank.py プロジェクト: tstikvoort/datagrowth
class RankProcessor(QuerySetProcessor):
    """Ranks the content of a query set with the help of feature frames.

    Subclasses add feature callables as attributes. Names listed in
    ``contextual_features`` are left out of ``get_features``.
    """

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=[],
        namespace="rank_processor"
    )

    contextual_features = []

    def __init__(self, config):
        """Create the feature and text frames that the configuration allows.

        ``feature_frame`` needs "identifier_key" and "feature_frame_path";
        ``text_frame`` needs "identifier_key", "text_frame_path" and
        "language". A frame whose keys are missing is set to None.
        """
        super().__init__(config)

        self.feature_frame = None
        if "identifier_key" in self.config and "feature_frame_path" in self.config:
            self.feature_frame = NumericFeaturesFrame(
                identifier=lambda ind: ind[self.config.identifier_key],
                features=self.get_features(),
                file_path=self.config.feature_frame_path
            )

        self.text_frame = None
        if "identifier_key" in self.config and "text_frame_path" in self.config and "language" in self.config:
            self.text_frame = TextFeaturesFrame(
                get_identifier=lambda ind: ind[self.config.identifier_key],
                get_text=self.get_text,
                language=self.config.language,
                file_path=self.config.text_frame_path
            )

    @staticmethod
    def get_text(document):
        """Placeholder that always raises NotImplementedError."""
        raise NotImplementedError("The get_text method should be implemented in its context")

    @classmethod
    def get_features(cls):
        """Return the callables that ``cls`` has on top of RankProcessor.

        Only attribute names that RankProcessor itself lacks are considered,
        and names in ``contextual_features`` are skipped.
        """
        inherited = set(dir(RankProcessor))
        features = []
        for attr in set(dir(cls)) - inherited:
            candidate = getattr(cls, attr)
            if callable(candidate) and attr not in cls.contextual_features:
                features.append(candidate)
        return features

    def get_ranking_results(self, ranking, query_set, series):
        """Yield the content of the best ranked entries with rank details.

        :param ranking: series-like object with the rank per identity
        :param query_set: query set to take the entries from
        :param series: series-like objects that each add a named detail entry
        :return: generator of content dicts with a "_rank" key, at most
            ``config.result_size`` of them
        """
        # TODO: assert identity? how?
        max_size = self.config.result_size

        if query_set.count() >= len(ranking):
            candidates = query_set.filter(identity__in=ranking.index[:max_size])
        else:
            candidates = query_set
        best = sorted(candidates, key=lambda entry: ranking.at[entry.identity], reverse=True)[:max_size]

        for individual in best:
            ix = individual[self.config.identifier_key]
            content = individual.content
            rank_info = {
                "rank": ranking.at[ix]
            }
            content["_rank"] = rank_info
            for serie in series:
                value = serie.at[ix]
                rank_info[serie.name] = {
                    "rank": value,  # TODO: rank value should be multiplied by weight
                    "value": value,
                    "weight": 1.0
                }
            yield content

    def default_ranking(self, query_set):
        """Placeholder that always raises NotImplementedError."""
        raise NotImplementedError("The default_ranking method should be implemented in its context")

    def by_feature(self, query_set):
        """Rank ``query_set`` by the feature named in ``config.ranking_feature``.

        A contextual feature is computed through
        ``feature_frame.get_feature_series``; any other feature is read from
        ``feature_frame.data``. Missing values count as 0.

        Raises:
            AssertionError: when "ranking_feature" is not configured, when
                there is no feature frame, or when the feature is neither in
                the feature frame nor contextual.
        """
        assert "ranking_feature" in self.config, "RankProcessor.by_feature needs a ranking_feature from config"
        assert self.feature_frame, \
            "RankProcessor needs a identifier_key and feature_frame_path configuration " \
            "to perform RankProcessor.by_feature"
        ranking_feature = self.config.ranking_feature
        assert ranking_feature in self.feature_frame.features or ranking_feature in self.contextual_features, \
            "The non-contextual feature '{}' is not loaded in the feature frame".format(ranking_feature)
        if ranking_feature in self.contextual_features:
            feature_values = self.feature_frame.get_feature_series(
                ranking_feature, getattr(self, ranking_feature),
                content_callable=query_set.iterator, context=self.config.to_dict()
            )
        else:
            feature_values = self.feature_frame.data[ranking_feature]
        ordered = feature_values.fillna(0).sort_values(ascending=False)
        return self.get_ranking_results(ordered, query_set, [ordered])

    def by_params(self, individuals):
        """Not implemented; returns None."""
        pass
コード例 #12
0
class HttpResourceProcessor(Processor):
    # TODO: make sphinx friendly and doc all methods
    """
    A collection of Celery tasks that share the need for a specific configuration.
    Each task should return a single list of ids to be handled further by classes like Growth.

    The configuration must include
    - a HttpResource class name to be loaded with Django
    - a guideline on how deep a single resource should collect data
    """

    ARGS_BATCH_METHODS = ["fetch_mass", "submit_mass"]

    config = ConfigurationProperty(
        storage_attribute="_config",
        defaults=DEFAULT_CONFIGURATION,
        private=["_resource", "_continuation_limit", "_batch_size"],
        namespace="http_resource",
    )

    def __init__(self, config):
        """Require a resource in ``config`` and reset the cached resource model.

        Raises:
            AssertionError: when neither "_resource" nor "resource" is a key
                of ``config``.
        """
        super(HttpResourceProcessor, self).__init__(config)
        has_resource = "_resource" in config or "resource" in config
        assert has_resource, \
            "HttpResourceProcessor expects a resource that it should fetch in the configuration."
        self._resource = None

    # ------------------------------------------------------------------
    # Interface
    # ------------------------------------------------------------------

    @staticmethod
    def async_results(result_id):
        """Return the result of the background task ``result_id``.

        Raises:
            DSProcessUnfinished: when the task is not ready yet.
            DSProcessError: when the task is ready but its status is not
                ``TaskStates.SUCCESS``.
        """
        task = AsyncResult(result_id)
        if task.ready():
            if task.status == TaskStates.SUCCESS:
                return task.result
            raise DSProcessError("An error occurred during background processing.")
        raise DSProcessUnfinished("Result with id {} is not ready.".format(result_id))

    def results(self, result):
        """Turn a pair of id collections into two querysets of the resource.

        ``result`` must unpack into exactly two values: the first is used for
        the first returned queryset, the second for the other one.
        """
        success_ids, error_ids = result
        return (
            self.resource.objects.filter(id__in=success_ids),
            self.resource.objects.filter(id__in=error_ids),
        )

    # ------------------------------------------------------------------
    # Getters
    # ------------------------------------------------------------------

    @property
    def resource(self):
        """Return the model for ``config.resource``, loaded on first use.

        The outcome of ``get_any_model`` is kept on ``_resource`` and reused
        as long as it is truthy.
        """
        if self._resource:
            return self._resource
        self._resource = get_any_model(self.config.resource)
        return self._resource

    @classmethod
    def get_session(cls, config):
        """Return a new ``requests.Session``; ``config`` is not used here."""
        return requests.Session()

    # ------------------------------------------------------------------
    # Tasks: wrappers that act as an interface to background retrieval
    # of resources. Every task gets the configuration as a plain dict and
    # this class' name as ``session``.
    # ------------------------------------------------------------------

    @property
    def fetch(self):
        """Return ``send.s`` prepared for the "get" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send.s(method="get", config=serialized, session=session_name)

    @property
    def fetch_mass(self):
        """Return ``send_mass.s`` prepared for the "get" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send_mass.s(method="get", config=serialized, session=session_name)

    @property
    def submit(self):
        """Return ``send.s`` prepared for the "post" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send.s(method="post", config=serialized, session=session_name)

    @property
    def submit_mass(self):
        """Return ``send_mass.s`` prepared for the "post" method."""
        serialized = self.config.to_dict(private=True, protected=True)
        session_name = self.__class__.__name__
        return send_mass.s(method="post", config=serialized, session=session_name)