Esempio n. 1
0
class SelectionSearch:
    """
    Search helper built around the cleaned data of a SelectionForm.

    Collects the query lines and filters described by the form and passes
    them to elastic to count, aggregate and retrieve articles.
    """

    def __init__(self, form):
        """
        Keep the form and wrap its cleaned data in a SelectionData object.

        No validation happens here: the form *must* already be valid.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Yield groups of (filter_name, filter_value) pairs for the selection.

        Three groups are produced, in this order: whatever get_date_filters()
        returns for the date fields, the ids of the selected articlesets
        ("sets") and the explicitly selected article ids ("ids", None when
        there are none).
        """
        data = self.data
        yield get_date_filters(
            data.start_date, data.end_date, data.on_date, data.datetype
        )

        set_ids = [articleset.id for articleset in data.articlesets]
        yield (("sets", set_ids),)
        yield (("ids", data.article_ids or None),)

    @cached
    def get_filters(self):
        """
        Flatten the filter groups into one dict that can be passed to elastic.

        @rtype: dict mapping filter name to filter value
        """
        filters = {}
        for name, value in chain(*self._get_filters()):
            # A value of None means the filter was not set; leave it out
            if value is not None:
                filters[name] = value
        return filters

    @cached
    def get_query(self):
        """
        Combine all queries into one parenthesised, OR-joined query string.

        @rtype: str, or None when there are no queries
        """
        parts = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(parts)
        return combined or None

    @cached
    def get_queries(self):
        """
        Parse the query field into SearchQuery objects, one per non-blank line.

        The parsed queries are passed through resolve_queries() together with
        the selected codebook and languages. Queries whose label starts with
        an underscore are left out of the result.

        @rtype: list of SearchQuery
        """
        raw_query = self.data.query
        if not raw_query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        stripped_lines = (line.strip() for line in raw_query.split("\n"))
        # Blank lines carry no query, so they are skipped before parsing
        parsed = [SearchQuery.from_string(line) for line in stripped_lines if line]

        resolved = resolve_queries(
            parsed,
            codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan,
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Return elastic's count for the combined query and filters.

        When the combined query raises a QueryParseError, every query is
        parsed on its own with its declared label (or 1-based position) as
        context, presumably so the error points at the offending query. If
        none of them raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # Parse the queries individually, passing label/position as context
            for position, search_query in enumerate(self.get_queries(), start=1):
                queryparser.parse_to_terms(
                    search_query.query,
                    context=(search_query.declared_label or position),
                )
            # None of the individual queries failed: propagate the original error
            raise

    @cached
    def get_statistics(self):
        """Return elastic's statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        """
        Aggregate over the given categories and return the sorted result.

        The combined query is only applied when none of the categories is a
        TermCategory; when aggregating on terms no global query is passed.

        @rtype: list, sorted with to_sortable_tuple as key
        """
        aggregates_on_terms = any(isinstance(c, TermCategory) for c in categories)
        query = None if aggregates_on_terms else self.get_query()

        rows = aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)
        return sorted(rows, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()."""
        flat_rows = self.get_aggregate(categories)
        return to_nested(flat_rows)

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        es = ES()
        return es.query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query."""
        for search_query in self.get_queries():
            ids = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(ids)

    def get_article_ids_per_query(self):
        """
        Return the matching article ids for each individual query.

        @rtype: dict mapping SearchQuery to list of article ids
        """
        return {
            search_query: ids
            for search_query, ids in self._get_article_ids_per_query()
        }

    def get_articles(self, size=None, offset=0, fields=()):
        """
        Query elastic for the articles matching the combined query and filters.

        size, offset and fields are forwarded to ES.query() as size, from_
        and fields respectively.
        """
        es = ES()
        return es.query(
            self.get_query(),
            self.get_filters(),
            True,
            size=size,
            from_=offset,
            fields=fields,
        )
Esempio n. 2
0
class SelectionSearch:
    """
    Search helper built around the cleaned data of a SelectionForm.

    Collects the query lines and filters described by the form and passes
    them to elastic to count, aggregate and retrieve articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles. Yields
        iterables of tuples containing (filter_name, filter_value).
        """
        yield get_date_filters(
            self.data.start_date, self.data.end_date,
            self.data.on_date, self.data.datetype
        )

        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        Join all queries into one parenthesised, OR-joined query string.

        @rtype: unicode, or None when there are no queries
        """
        return ' OR '.join('(%s)' % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per non-blank line of the query field.

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        lines = (line.strip() for line in self.data.query.split("\n"))
        # Skip blank lines (e.g. a trailing newline): each parsed line becomes
        # one "(...)" term of the combined query, so an empty line would
        # contribute an empty term.
        queries = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            queries, codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """Return elastic's count for the combined query and filters."""
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        """Return elastic's statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects whose ids are listed by get_medium_ids()."""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """
        Run an aggregate query grouped by x_axis and, unless it is "total", y_axis.

        Both axes are first translated through FIELD_MAP (unknown names are
        used as given). When one of the axes is "term" no global query is
        passed; the individual queries are always passed as terms.

        @param interval: forwarded to ES.aggregate_query() as date_interval
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query, terms=self.get_queries(),
            filters=self.get_filters(), group_by=group_by,
            date_interval=interval, sets=map(attrgetter("id"), self.data.articlesets)
        )

        # These are the module-level helpers, not the methods of this class.
        # Each gets its own copy of group_by.
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))

        return aggr

    def get_medium_ids(self):
        """Return elastic's list of media for the combined query and filters."""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query."""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict mapping each SearchQuery to its list of article ids."""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """
        Return the Article objects of this selection, in get_article_ids() order.

        An id that Article.objects.in_bulk() does not return raises KeyError
        while the result is being iterated.

        @param size: maximum number of articles to return; None means no limit
        @param offset: number of leading article ids to skip
        @return: generator of Article objects
        """
        article_ids = self.get_article_ids()
        if size is not None or offset:
            # The offset has to be honoured even when no size limit is given
            stop = None if size is None else offset + size
            article_ids = islice(article_ids, offset, stop)

        # in_bulk() returns a dict, so re-impose the order of the ids
        article_ids = tuple(article_ids)
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)
Esempio n. 3
0
class SelectionSearch:
    """
    Search helper built around a SelectionForm.

    Collects the query lines and filters described by the form and passes
    them to elastic to count, aggregate and retrieve articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form)

    def _get_set_filters(self) -> Iterable[Tuple[str, Any]]:
        """Yield the ("sets", articleset ids) filter for the selected articlesets."""
        yield "sets", [a.id for a in self.data.articlesets]

    def _get_filters(self) -> Iterable[Tuple[str, Any]]:
        """
        Get filters for dates, articles, articlesets and the additional
        filters of the selection. Yields (filter_name, filter_value) tuples.

        The "ids" value is None when no article ids are selected.
        """

        if self.data.start_date is not None:
            yield "start_date", self.data.start_date

        if self.data.end_date is not None:
            yield "end_date", self.data.end_date

        yield "ids", self.data.article_ids or None

        yield from self._get_set_filters()

        # `filters` may be empty or None; each filter contributes its own pairs
        for selection_filter in self.data.filters or ():
            yield from selection_filter.get_filter_kwargs()

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in self._get_filters() if v is not None}

    @cached
    def get_query(self):
        """
        Join all queries into one parenthesised, OR-joined query string.

        @rtype: str, or None when there are no queries
        """
        return ' OR '.join('(%s)' % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per non-blank line of the query field.

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery"""
        if not self.data.query:
            return []
        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        lines = (line.strip() for line in self.data.query.split("\n"))
        # Blank lines carry no query, so they are skipped before parsing
        queries = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            queries, codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """
        Return elastic's count for the combined query and filters.

        When the combined query raises a QueryParseError, every query is
        parsed on its own with its declared label (or 1-based position) as
        context, presumably so the error points at the offending query. If
        none of them raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # try queries one by one
            for i, q in enumerate(self.get_queries()):
                queryparser.parse_to_terms(q.query, context=(q.declared_label or i + 1))
            # if error wasn't raised yet, re-raise original
            raise

    @cached
    def get_statistics(self):
        """Return elastic's statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        """
        Aggregate over the given categories.

        The combined query is only applied when none of the categories is a
        TermCategory.
        """
        # If we're aggregating on terms, we don't want a global filter
        query = None
        if not any(isinstance(c, TermCategory) for c in categories):
            query = self.get_query()

        return aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()."""
        return to_nested(self.get_aggregate(categories))

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query."""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict mapping each SearchQuery to its list of article ids."""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0, fields=(), **kwargs):
        """
        Query elastic for the articles matching the combined query and filters.

        size, offset and fields are forwarded to ES.query() as size, from_
        and _source respectively; extra keyword arguments are passed through.
        """
        return ES().query(self.get_query(), self.get_filters(), True, size=size, from_=offset, _source=fields, **kwargs)

    @staticmethod
    def get_instance(form):
        """
        Gets a SelectionSearch instance depending on the selection data.
        If codingjobs are given, a CodingJobSelectionSearch is returned.

        :param form: A SelectionForm
        :return: An instance of SelectionSearch that is appropriate for the given SelectionForm.
        :raises ValueError: if the selection contains neither codingjobs nor articlesets.
        """
        data = SelectionData(form)
        if data.codingjobs:
            return CodingJobSelectionSearch(form)
        if data.articlesets:
            return SelectionSearch(form)

        # ValueError is a subclass of Exception, so `except Exception` callers still catch it
        raise ValueError("Invalid selection: no articlesets or codingjobs given.")
Esempio n. 4
0
class SelectionSearch:
    """
    Search helper built around the cleaned data of a SelectionForm.

    Collects the query lines and filters described by the form and passes
    them to elastic to count, aggregate and retrieve articles and mediums.
    """

    def __init__(self, form):
        """
        Keep the form and wrap its cleaned data in a SelectionData object.

        No validation happens here: the form *must* already be valid.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Yield groups of (filter_name, filter_value) pairs for the selection.

        Four groups are produced, in this order: whatever get_date_filters()
        returns for the date fields, the ids of the selected mediums
        ("mediumid"), the ids of the selected articlesets ("sets") and the
        explicitly selected article ids ("ids", None when there are none).
        """
        selection = self.data
        yield get_date_filters(
            selection.start_date, selection.end_date,
            selection.on_date, selection.datetype,
        )

        medium_ids = [medium.id for medium in selection.mediums]
        yield (("mediumid", medium_ids),)

        articleset_ids = [articleset.id for articleset in selection.articlesets]
        yield (("sets", articleset_ids),)

        yield (("ids", selection.article_ids or None),)

    @cached
    def get_filters(self):
        """
        Flatten the filter groups into one dict that can be passed to elastic.

        @rtype: dict mapping filter name to filter value
        """
        result = {}
        for key, value in chain(*self._get_filters()):
            if value is None:
                # Filters without a value are not passed on
                continue
            result[key] = value
        return result

    @cached
    def get_query(self):
        """
        Combine all queries into one parenthesised, OR-joined query string.

        @rtype: str, or None when there are no queries
        """
        joined = ' OR '.join(['(%s)' % sq.query for sq in self.get_queries()])
        if not joined:
            return None
        return joined

    @cached
    def get_queries(self):
        """
        Parse the query field into SearchQuery objects, one per non-blank line.

        The parsed queries are passed through resolve_queries() together with
        the selected codebook and languages. Queries whose label starts with
        an underscore are left out of the result.

        @rtype: list of SearchQuery
        """
        text = self.data.query
        if not text:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        candidates = []
        for line in text.split("\n"):
            line = line.strip()
            # Blank lines carry no query
            if line:
                candidates.append(SearchQuery.from_string(line))

        resolved = resolve_queries(
            candidates,
            codebook=codebook,
            label_language=label_lan,
            replacement_language=replacement_lan,
        )

        return [sq for sq in resolved if not sq.label.startswith("_")]

    @cached
    def get_count(self):
        """
        Return elastic's count for the combined query and filters.

        When the combined query raises a QueryParseError, every query is
        parsed on its own with its declared label (or 1-based position) as
        context, presumably so the error points at the offending query. If
        none of them raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # Parse the queries individually, passing label/position as context
            for number, sq in enumerate(self.get_queries(), start=1):
                queryparser.parse_to_terms(sq.query, context=(sq.declared_label or number))
            # Nothing failed individually, so propagate the original error
            raise

    @cached
    def get_statistics(self):
        """Return elastic's statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects whose ids are listed by get_medium_ids()."""
        medium_ids = self.get_medium_ids()
        return Medium.objects.filter(id__in=medium_ids)

    def get_aggregate(self, categories, flat=True):
        """
        Aggregate over the given categories and return the sorted result.

        The combined query is only applied when none of the categories is a
        TermCategory; when aggregating on terms no global query is passed.

        @rtype: list, sorted with to_sortable_tuple as key
        """
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()

        rows = aggregate(query, self.get_filters(), categories, flat=flat)
        return sorted(rows, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()."""
        rows = self.get_aggregate(categories)
        return to_nested(rows)

    def get_medium_ids(self):
        """Return elastic's list of media for the combined query and filters."""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        es = ES()
        return es.query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query."""
        for sq in self.get_queries():
            matching = ES().query_ids(sq.query, self.get_filters())
            yield sq, list(matching)

    def get_article_ids_per_query(self):
        """
        Return the matching article ids for each individual query.

        @rtype: dict mapping SearchQuery to list of article ids
        """
        return {sq: ids for sq, ids in self._get_article_ids_per_query()}

    def get_articles(self, size=None, offset=0):
        """
        Query elastic for a fixed set of fields of the matching articles.

        size and offset are forwarded to ES.query() as size and from_.
        lead=True is passed only when there is no query.
        """
        query = self.get_query()
        return ES().query(
            query,
            self.get_filters(),
            True,
            size=size,
            from_=offset,
            fields=['headline', 'text', 'date', 'length', 'medium', 'author', 'section'],
            lead=not query,
        )
Esempio n. 5
0
class SelectionSearch:
    """
    Search helper built around the cleaned data of a SelectionForm.

    Collects the query lines and filters described by the form and passes
    them to elastic to count, aggregate and retrieve articles.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing.
        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles. Yields
        iterables of tuples containing (filter_name, filter_value).
        """
        yield get_date_filters(self.data.start_date, self.data.end_date, self.data.on_date, self.data.datetype)

        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        Join all queries into one parenthesised, OR-joined query string.

        @rtype: unicode, or None when there are no queries
        """
        return " OR ".join("(%s)" % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """Get SearchQuery objects, one per non-blank line of the query field.

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery"""
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        lines = (line.strip() for line in self.data.query.split("\n"))
        # Skip blank lines (e.g. a trailing newline): each parsed line becomes
        # one "(...)" term of the combined query, so an empty line would
        # contribute an empty term.
        queries = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            queries, codebook=codebook, label_language=label_lan, replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """Return elastic's count for the combined query and filters."""
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        """Return elastic's statistics for the combined query and filters."""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects whose ids are listed by get_medium_ids()."""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """
        Run an aggregate query grouped by x_axis and, unless it is "total", y_axis.

        Both axes are first translated through FIELD_MAP (unknown names are
        used as given). When one of the axes is "term" no global query is
        passed; the individual queries are always passed as terms.

        @param interval: forwarded to ES.aggregate_query() as date_interval
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query,
            terms=self.get_queries(),
            filters=self.get_filters(),
            group_by=group_by,
            date_interval=interval,
            sets=map(attrgetter("id"), self.data.articlesets),
        )

        # These are the module-level helpers, not the methods of this class.
        # Each gets its own copy of group_by.
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))

        return aggr

    def get_medium_ids(self):
        """Return elastic's list of media for the combined query and filters."""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the article ids matching the combined query and filters."""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query."""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict mapping each SearchQuery to its list of article ids."""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """
        Return the Article objects of this selection, in get_article_ids() order.

        An id that Article.objects.in_bulk() does not return raises KeyError
        while the result is being iterated.

        @param size: maximum number of articles to return; None means no limit
        @param offset: number of leading article ids to skip
        @return: generator of Article objects
        """
        article_ids = self.get_article_ids()
        if size is not None or offset:
            # The offset has to be honoured even when no size limit is given
            stop = None if size is None else offset + size
            article_ids = islice(article_ids, offset, stop)

        # in_bulk() returns a dict, so re-impose the order of the ids
        article_ids = tuple(article_ids)
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)