class SelectionSearch:
    """
    Builds elastic queries and filters from the cleaned data of a
    SelectionForm and offers counting, statistics, aggregation and article
    retrieval on top of them.
    """

    def __init__(self, form):
        """
        The caller must have validated the form: its cleaned_data is read here.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Produce the filter groups of this selection: the date filters, the
        article set ids and the explicitly selected article ids.

        Every yielded item is an iterable of (filter_name, filter_value) pairs.
        """
        selection = self.data
        yield get_date_filters(
            selection.start_date, selection.end_date,
            selection.on_date, selection.datetype,
        )

        set_ids = [articleset.id for articleset in selection.articlesets]
        yield (("sets", set_ids),)

        # An empty id selection is turned into None, which get_filters() drops
        yield (("ids", selection.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns a {filter_name: value} dict, without None values, which can be passed to elastic"""
        pairs = chain(*self._get_filters())
        return {name: value for name, value in pairs if value is not None}

    @cached
    def get_query(self):
        """
        Wrap every query in parentheses and join them with OR.

        @return: the combined query, or None when there are no queries
        @rtype: str
        """
        wrapped = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(wrapped)
        return combined or None

    @cached
    def get_queries(self):
        """
        Parse the query field, one query per line, into SearchQuery objects
        and resolve them against the selected codebook (if any).

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery
        """
        data = self.data
        if not data.query:
            return []

        codebook = data.codebook
        label_language = data.codebook_label_language
        replacement_language = data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        # Strip every line and skip the ones which are empty afterwards
        stripped_lines = map(str.strip, data.query.split("\n"))
        parsed = [SearchQuery.from_string(line) for line in stripped_lines if line]

        resolved = resolve_queries(
            parsed,
            codebook=codebook,
            label_language=label_language,
            replacement_language=replacement_language,
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Ask elastic for the count belonging to the combined query and filters.

        When that raises QueryParseError, every query is parsed on its own,
        with its declared label (or else its 1-based position) as context; if
        none of those parses raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # Parse the queries individually
            for position, search_query in enumerate(self.get_queries(), start=1):
                context = search_query.declared_label or position
                queryparser.parse_to_terms(search_query.query, context=context)

            # No individual parse failed: propagate the original error
            raise

    @cached
    def get_statistics(self):
        """Return the result of es.statistics() for the combined query and filters"""
        query = self.get_query()
        return self.es.statistics(query, self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        """
        Aggregate over the given categories and return the result as a list
        sorted with to_sortable_tuple as key.

        flat and objects are handed to aggregate() unchanged.
        """
        # A global query filter is unwanted as soon as we aggregate on terms
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()

        rows = aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)
        return sorted(rows, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()"""
        flat_aggregate = self.get_aggregate(categories)
        return to_nested(flat_aggregate)

    def get_article_ids(self):
        """Return the result of ES().query_ids() for the combined query and filters"""
        query = self.get_query()
        return ES().query_ids(query, self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query"""
        for search_query in self.get_queries():
            matching_ids = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(matching_ids)

    def get_article_ids_per_query(self):
        """Return a dict which maps every SearchQuery to the list of its article ids"""
        per_query = self._get_article_ids_per_query()
        return dict(per_query)

    def get_articles(self, size=None, offset=0, fields=()):
        """
        Run the combined query with the filters through ES().query().

        size, offset (as from_) and fields are handed to ES().query() as
        keyword arguments.
        """
        query = self.get_query()
        filters = self.get_filters()
        return ES().query(query, filters, True, size=size, from_=offset, fields=fields)
class SelectionSearch:
    """
    Builds elastic queries and filters from the cleaned data of a
    SelectionForm and offers counting, statistics, aggregation and article
    retrieval on top of them.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing: its cleaned_data is read here.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles for the form
        given to the constructor.

        Yields iterables of tuples containing (filter_name, filter_value).
        """
        yield get_date_filters(
            self.data.start_date, self.data.end_date,
            self.data.on_date, self.data.datetype
        )

        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        # An empty id selection becomes None, which get_filters() removes
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        Wrap every query in parentheses and join them with OR.

        @return: the combined query, or None when there are no queries
        @rtype: text string
        """
        return ' OR '.join('(%s)' % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """
        Get SearchQuery objects: one per non-blank line of the query field,
        resolved against the selected codebook (if any). Queries whose label
        starts with an underscore are left out.

        @rtype: list of SearchQuery
        """
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        # Strip through the instance method instead of `unicode.strip`, so
        # this works for any text type and on both Python 2 and Python 3.
        lines = (line.strip() for line in self.data.query.split("\n"))

        # Blank lines (e.g. a trailing newline) are not queries: skip them
        # rather than handing empty strings to SearchQuery.from_string. The
        # list is built eagerly so resolve_queries never gets a lazy iterator.
        queries = [SearchQuery.from_string(line) for line in lines if line]

        resolved = resolve_queries(
            queries, codebook=codebook, label_language=label_lan,
            replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """Return the result of es.count() for the combined query and filters"""
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        """Return the result of es.statistics() for the combined query and filters"""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects whose id is in get_medium_ids()"""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """
        Aggregate the selection over one or two axes.

        @param x_axis: first axis to group by; translated through FIELD_MAP
                       when it is a key of that mapping
        @param y_axis: second axis, translated the same way; "total" means
                       grouping on x_axis only
        @param interval: passed to ES().aggregate_query() as date_interval
        @return: result of ES().aggregate_query() after get_mediums() and
                 get_articlesets() have processed it
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        # No global query when one of the axes is "term"; the individual
        # queries are always passed separately as `terms`.
        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query,
            terms=self.get_queries(),
            filters=self.get_filters(),
            group_by=group_by,
            date_interval=interval,
            # A real list (not a bare map()) so the value can be iterated
            # more than once regardless of the Python version.
            sets=[aset.id for aset in self.data.articlesets]
        )

        # Both helpers get their own copy of group_by.
        # NOTE(review): presumably they replace ids in the aggregation by
        # model objects - confirm against their definitions.
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))
        return aggr

    def get_medium_ids(self):
        """Return the result of es.list_media() for the combined query and filters"""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return the result of ES().query_ids() for the combined query and filters"""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query"""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict which maps every SearchQuery to the list of its article ids"""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """
        Return the Article objects of this selection, in the order in which
        get_article_ids() yields their ids.

        @param size: maximum number of articles to return; None for no limit
        @param offset: number of leading article ids to skip
        @return: generator of Article objects; an id which is missing from
                 the in_bulk() result raises KeyError once it is reached
        """
        article_ids = self.get_article_ids()

        if size is not None or offset:
            # Apply the offset even when no size is given; previously it was
            # silently ignored in that case.
            stop = None if size is None else offset + size
            article_ids = islice(article_ids, offset, stop)

        # Return in order: the ids are needed for the bulk lookup and once
        # more to restore their order, so materialise them first.
        article_ids = tuple(article_ids)
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)
class SelectionSearch:
    """
    Builds elastic queries and filters from a SelectionForm and offers
    counting, statistics, aggregation and article retrieval on top of them.
    """

    def __init__(self, form):
        """
        The caller must have validated the form before handing it over.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form)

    def _get_set_filters(self):
        """Yield the ("sets", ids) filter holding the ids of the selected article sets"""
        set_ids = [articleset.id for articleset in self.data.articlesets]
        yield "sets", set_ids

    def _get_filters(self) -> Iterable[Tuple[str, Any]]:
        """
        Produce the (filter_name, filter_value) pairs of this selection:
        start and end date (only when set), article ids, article sets and
        whatever the additional filters of the selection contribute.
        """
        data = self.data

        start_date = data.start_date
        if start_date is not None:
            yield "start_date", start_date

        end_date = data.end_date
        if end_date is not None:
            yield "end_date", end_date

        # An empty id selection is turned into None, which get_filters() drops
        yield "ids", data.article_ids or None

        yield from self._get_set_filters()

        for extra_filter in data.filters or ():
            yield from extra_filter.get_filter_kwargs()

    @cached
    def get_filters(self):
        """Returns a {filter_name: value} dict, without None values, which can be passed to elastic"""
        filters = {}
        for name, value in self._get_filters():
            if value is not None:
                filters[name] = value
        return filters

    @cached
    def get_query(self):
        """
        Wrap every query in parentheses and join them with OR.

        @return: the combined query, or None when there are no queries
        @rtype: str
        """
        wrapped = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(wrapped)
        return combined or None

    @cached
    def get_queries(self):
        """
        Parse the query field, one query per line, into SearchQuery objects
        and resolve them against the selected codebook (if any).

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery
        """
        data = self.data
        if not data.query:
            return []

        codebook = data.codebook
        label_language = data.codebook_label_language
        replacement_language = data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        # Strip every line and skip the ones which are empty afterwards
        stripped_lines = map(str.strip, data.query.split("\n"))
        parsed = [SearchQuery.from_string(line) for line in stripped_lines if line]

        resolved = resolve_queries(
            parsed,
            codebook=codebook,
            label_language=label_language,
            replacement_language=replacement_language,
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Ask elastic for the count belonging to the combined query and filters.

        When that raises QueryParseError, every query is parsed on its own,
        with its declared label (or else its 1-based position) as context; if
        none of those parses raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # Parse the queries individually
            for position, search_query in enumerate(self.get_queries(), start=1):
                context = search_query.declared_label or position
                queryparser.parse_to_terms(search_query.query, context=context)

            # No individual parse failed: propagate the original error
            raise

    @cached
    def get_statistics(self):
        """Return the result of es.statistics() for the combined query and filters"""
        query = self.get_query()
        return self.es.statistics(query, self.get_filters())

    def get_aggregate(self, categories, flat=True, objects=True):
        """
        Aggregate over the given categories and return what aggregate() gives.

        flat and objects are handed to aggregate() unchanged.
        """
        # A global query filter is unwanted as soon as we aggregate on terms
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()
        return aggregate(query, self.get_filters(), categories, flat=flat, objects=objects)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()"""
        flat_aggregate = self.get_aggregate(categories)
        return to_nested(flat_aggregate)

    def get_article_ids(self):
        """Return the result of ES().query_ids() for the combined query and filters"""
        query = self.get_query()
        return ES().query_ids(query, self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query"""
        for search_query in self.get_queries():
            matching_ids = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(matching_ids)

    def get_article_ids_per_query(self):
        """Return a dict which maps every SearchQuery to the list of its article ids"""
        per_query = self._get_article_ids_per_query()
        return dict(per_query)

    def get_articles(self, size=None, offset=0, fields=(), **kwargs):
        """
        Run the combined query with the filters through ES().query().

        size, offset (as from_) and fields (as _source) are handed over as
        keyword arguments, followed by any extra keyword arguments.
        """
        query = self.get_query()
        filters = self.get_filters()
        return ES().query(query, filters, True, size=size, from_=offset, _source=fields, **kwargs)

    @staticmethod
    def get_instance(form):
        """
        Gets the SelectionSearch variant which fits the selection data.

        :param form: A SelectionForm
        :return: A CodingJobSelectionSearch when codingjobs are selected,
                 otherwise a SelectionSearch when articlesets are selected.
        :raises Exception: when neither codingjobs nor articlesets are given.
        """
        data = SelectionData(form)

        if data.codingjobs:
            return CodingJobSelectionSearch(form)

        if not data.articlesets:
            raise Exception("Invalid selection: no articlesets or codingjobs given.")

        return SelectionSearch(form)
class SelectionSearch:
    """
    Builds elastic queries and filters from the cleaned data of a
    SelectionForm and offers counting, statistics, aggregation, medium and
    article retrieval on top of them.
    """

    def __init__(self, form):
        """
        The caller must have validated the form: its cleaned_data is read here.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Produce the filter groups of this selection: the date filters, the
        medium ids, the article set ids and the explicitly selected article ids.

        Every yielded item is an iterable of (filter_name, filter_value) pairs.
        """
        selection = self.data
        yield get_date_filters(
            selection.start_date, selection.end_date,
            selection.on_date, selection.datetype,
        )

        medium_ids = [medium.id for medium in selection.mediums]
        yield (("mediumid", medium_ids),)

        set_ids = [articleset.id for articleset in selection.articlesets]
        yield (("sets", set_ids),)

        # An empty id selection is turned into None, which get_filters() drops
        yield (("ids", selection.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns a {filter_name: value} dict, without None values, which can be passed to elastic"""
        pairs = chain(*self._get_filters())
        return {name: value for name, value in pairs if value is not None}

    @cached
    def get_query(self):
        """
        Wrap every query in parentheses and join them with OR.

        @return: the combined query, or None when there are no queries
        @rtype: str
        """
        wrapped = ['(%s)' % search_query.query for search_query in self.get_queries()]
        combined = ' OR '.join(wrapped)
        return combined or None

    @cached
    def get_queries(self):
        """
        Parse the query field, one query per line, into SearchQuery objects
        and resolve them against the selected codebook (if any).

        Queries whose label starts with an underscore are left out.

        @rtype: list of SearchQuery
        """
        data = self.data
        if not data.query:
            return []

        codebook = data.codebook
        label_language = data.codebook_label_language
        replacement_language = data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        # Strip every line and skip the ones which are empty afterwards
        stripped_lines = map(str.strip, data.query.split("\n"))
        parsed = [SearchQuery.from_string(line) for line in stripped_lines if line]

        resolved = resolve_queries(
            parsed,
            codebook=codebook,
            label_language=label_language,
            replacement_language=replacement_language,
        )

        visible = []
        for search_query in resolved:
            if not search_query.label.startswith("_"):
                visible.append(search_query)
        return visible

    @cached
    def get_count(self):
        """
        Ask elastic for the count belonging to the combined query and filters.

        When that raises QueryParseError, every query is parsed on its own,
        with its declared label (or else its 1-based position) as context; if
        none of those parses raises, the original error is re-raised.
        """
        try:
            return self.es.count(self.get_query(), self.get_filters())
        except queryparser.QueryParseError:
            # Parse the queries individually
            for position, search_query in enumerate(self.get_queries(), start=1):
                context = search_query.declared_label or position
                queryparser.parse_to_terms(search_query.query, context=context)

            # No individual parse failed: propagate the original error
            raise

    @cached
    def get_statistics(self):
        """Return the result of es.statistics() for the combined query and filters"""
        query = self.get_query()
        return self.es.statistics(query, self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects whose id is in get_medium_ids()"""
        medium_ids = self.get_medium_ids()
        return Medium.objects.filter(id__in=medium_ids)

    def get_aggregate(self, categories, flat=True):
        """
        Aggregate over the given categories and return the result as a list
        sorted with to_sortable_tuple as key.

        flat is handed to aggregate() unchanged.
        """
        # A global query filter is unwanted as soon as we aggregate on terms
        on_terms = any(isinstance(category, TermCategory) for category in categories)
        query = None if on_terms else self.get_query()

        rows = aggregate(query, self.get_filters(), categories, flat=flat)
        return sorted(rows, key=to_sortable_tuple)

    def get_nested_aggregate(self, categories):
        """Return get_aggregate(categories) converted with to_nested()"""
        flat_aggregate = self.get_aggregate(categories)
        return to_nested(flat_aggregate)

    def get_medium_ids(self):
        """Return the result of es.list_media() for the combined query and filters"""
        query = self.get_query()
        return self.es.list_media(query, self.get_filters())

    def get_article_ids(self):
        """Return the result of ES().query_ids() for the combined query and filters"""
        query = self.get_query()
        return ES().query_ids(query, self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield (SearchQuery, list of article ids) for each individual query"""
        for search_query in self.get_queries():
            matching_ids = ES().query_ids(search_query.query, self.get_filters())
            yield search_query, list(matching_ids)

    def get_article_ids_per_query(self):
        """Return a dict which maps every SearchQuery to the list of its article ids"""
        per_query = self._get_article_ids_per_query()
        return dict(per_query)

    def get_articles(self, size=None, offset=0):
        """
        Run the combined query with the filters through ES().query(), asking
        for a fixed list of article fields.

        size and offset (as from_) are handed over as keyword arguments; the
        lead keyword is True exactly when there is no query.
        """
        query = self.get_query()
        wanted_fields = ['headline', 'text', 'date', 'length', 'medium', 'author', 'section']
        return ES().query(
            query, self.get_filters(), True,
            size=size, from_=offset, fields=wanted_fields, lead=not query,
        )
class SelectionSearch:
    """
    Translates the cleaned data of a SelectionForm into elastic queries and
    filters, and provides counting, statistics, aggregation and article
    retrieval based on them.
    """

    def __init__(self, form):
        """
        Form *must* be valid before passing: its cleaned_data is read here.

        @type form: SelectionForm
        """
        self.es = ES()
        self.form = form
        self.data = SelectionData(form.cleaned_data)

    def _get_filters(self):
        """
        Get filters for dates, mediums, articlesets and articles for the form
        given to the constructor.

        Yields iterables of tuples containing (filter_name, filter_value).
        """
        yield get_date_filters(self.data.start_date, self.data.end_date, self.data.on_date, self.data.datetype)

        yield (("mediumid", [m.id for m in self.data.mediums]),)
        yield (("sets", [a.id for a in self.data.articlesets]),)
        # A falsy article id selection is passed on as None, which
        # get_filters() filters out again
        yield (("ids", self.data.article_ids or None),)

    @cached
    def get_filters(self):
        """Returns dict with filter -> value, which can be passed to elastic"""
        # Remove all filters which value is None
        return {k: v for k, v in chain(*self._get_filters()) if v is not None}

    @cached
    def get_query(self):
        """
        Join all queries, each wrapped in parentheses, with OR.

        @return: the combined query, or None when there are no queries
        @rtype: text string
        """
        return " OR ".join("(%s)" % q.query for q in self.get_queries()) or None

    @cached
    def get_queries(self):
        """
        Get SearchQuery objects: one for each non-blank line of the query
        field, resolved against the selected codebook (if any). Queries whose
        label starts with an underscore are not returned.

        @rtype: list of SearchQuery
        """
        if not self.data.query:
            return []

        codebook = self.data.codebook
        label_lan = self.data.codebook_label_language
        replacement_lan = self.data.codebook_replacement_language

        if codebook:
            codebook.cache_labels()

        # Calling .strip() on each line (rather than mapping `unicode.strip`)
        # accepts any text type and runs on Python 2 as well as Python 3.
        stripped = (line.strip() for line in self.data.query.split("\n"))

        # Lines which are empty after stripping carry no query, so they are
        # dropped instead of being parsed by SearchQuery.from_string. Building
        # a list keeps resolve_queries away from a one-shot iterator.
        queries = [SearchQuery.from_string(line) for line in stripped if line]

        resolved = resolve_queries(
            queries, codebook=codebook, label_language=label_lan, replacement_language=replacement_lan
        )

        return [q for q in resolved if not q.label.startswith("_")]

    @cached
    def get_count(self):
        """Return what es.count() gives for the combined query and filters"""
        return self.es.count(self.get_query(), self.get_filters())

    @cached
    def get_statistics(self):
        """Return what es.statistics() gives for the combined query and filters"""
        return self.es.statistics(self.get_query(), self.get_filters())

    @cached
    def get_mediums(self):
        """Return the Medium objects with an id listed by get_medium_ids()"""
        return Medium.objects.filter(id__in=self.get_medium_ids())

    def get_aggregate(self, x_axis, y_axis, interval="month"):
        """
        Aggregate the selection over one or two axes.

        @param x_axis: first axis to group by; replaced by its FIELD_MAP
                       entry when it has one
        @param y_axis: second axis, mapped the same way; with "total" only
                       x_axis is used for grouping
        @param interval: handed to ES().aggregate_query() as date_interval
        @return: what ES().aggregate_query() gives, after it went through
                 get_mediums() and get_articlesets()
        """
        x_axis = FIELD_MAP.get(x_axis, x_axis)
        y_axis = FIELD_MAP.get(y_axis, y_axis)

        if y_axis == "total":
            group_by = [x_axis]
        else:
            group_by = [x_axis, y_axis]

        # With "term" as one of the axes no global query is used; the
        # separate queries are handed over as `terms` in every case.
        query = None if "term" in (x_axis, y_axis) else self.get_query()

        aggr = ES().aggregate_query(
            query=query,
            terms=self.get_queries(),
            filters=self.get_filters(),
            group_by=group_by,
            date_interval=interval,
            # An actual list instead of a bare map(), which on Python 3 would
            # be an iterator that is exhausted after a single pass.
            sets=[aset.id for aset in self.data.articlesets],
        )

        # Each helper receives a fresh copy of group_by.
        # NOTE(review): these look like they swap ids in the aggregation for
        # model objects - verify against their definitions.
        aggr = get_mediums(aggr, list(group_by))
        aggr = get_articlesets(aggr, list(group_by))
        return aggr

    def get_medium_ids(self):
        """Return what es.list_media() gives for the combined query and filters"""
        return self.es.list_media(self.get_query(), self.get_filters())

    def get_article_ids(self):
        """Return what ES().query_ids() gives for the combined query and filters"""
        return ES().query_ids(self.get_query(), self.get_filters())

    def _get_article_ids_per_query(self):
        """Yield a (SearchQuery, list of article ids) pair per individual query"""
        for q in self.get_queries():
            yield q, list(ES().query_ids(q.query, self.get_filters()))

    def get_article_ids_per_query(self):
        """Return a dict from each SearchQuery to the list of its article ids"""
        return dict(self._get_article_ids_per_query())

    def get_articles(self, size=None, offset=0):
        """
        Return the Article objects of this selection, ordered like the ids
        coming from get_article_ids().

        @param size: maximum number of articles to return; None for no limit
        @param offset: number of leading article ids to skip
        @return: generator of Article objects; reaching an id which the
                 in_bulk() result lacks raises KeyError
        """
        article_ids = self.get_article_ids()

        if size is not None or offset:
            # The offset has to be honoured without a size as well; it used
            # to be dropped silently whenever size was None.
            stop = None if size is None else offset + size
            article_ids = islice(article_ids, offset, stop)

        # Return in order: the ids feed the bulk lookup and are walked again
        # to restore their order, hence the tuple.
        article_ids = tuple(article_ids)
        article_dict = Article.objects.in_bulk(article_ids)
        return (article_dict[pk] for pk in article_ids)