def more_like_this(self, request, pk=None, id=None):
    """More-like-this functionality detail view.

    Finds documents similar to the one identified by ``pk``/``id`` and
    renders a standard (optionally paginated) list response.

    :param request: DRF request.
    :param pk: Primary key of the reference document.
    :param id: Alternative id of the reference document.
    :return: rest_framework.response.Response
    """
    if 'view' in request.parser_context:
        view = request.parser_context['view']
        kwargs = copy.copy(getattr(view, 'more_like_this_options', {}))
        id_ = pk if pk else id
        # Use current queryset
        queryset = self.filter_queryset(self.get_queryset())

        # We do not try to get fields from current serializer. On the
        # Elasticsearch side if no ``fields`` value is given, ``_all`` is
        # used, and although some serializers could contain less fields
        # than available, this seems like the best approach. If you want to
        # fall back to ``_all`` of Elasticsearch, leave it empty.
        fields = kwargs.pop('fields', [])
        # if not fields:
        #     serializer_class = self.get_serializer_class()
        #     fields = serializer_class.Meta.fields[:]

        like = {
            '_id': "{}".format(id_),
            '_index': "{}".format(self.index),
            '_type': "{}".format(self.mapping),
        }
        # DEDUP: the original if/else branches were identical except for
        # the ``fields`` kwarg — build the kwargs once instead.
        if fields:
            kwargs['fields'] = fields
        queryset = queryset.query(
            MoreLikeThis(like=like, **kwargs)
        ).sort('_score')

        # Standard list-view implementation
        page = self.paginate_queryset(queryset)
        if page is not None:
            serializer = self.get_serializer(page, many=True)
            return self.get_paginated_response(serializer.data)

        serializer = self.get_serializer(queryset, many=True)
        return Response(serializer.data)
def more_like_this(self, item, max_hits=3):
    """Return up to ``max_hits`` documents similar to ``item``.

    Builds a More-Like-This query over the item's indexable content and
    category names, then applies the default filter clauses (e.g. to
    drop past events).
    """
    mlt_query = MoreLikeThis(
        like=[
            # {'_id': ElasticIndex._get_id(item), '_index': self.index_name},
            item.indexable_content(),
            item.category_names(),
        ],
        min_term_freq=1,
        min_doc_freq=2,
        max_query_terms=12,
        fields=['title', 'content', 'description', 'location', 'category',
                'organization_name', 'website'])

    search = Search(index=self.index_name)
    search = search.doc_type(StarDocument).query(mlt_query)
    search = search[0:max_hits]

    # Filter out past events
    search = search.filter('bool', should=self._default_filter())

    return search.execute()
def more_like_this(self, request, pk=None, id=None):
    """More-like-this functionality detail view.

    :param request:
    :return:
    """
    if 'view' in request.parser_context:
        view = request.parser_context['view']
        options = copy.copy(getattr(view, 'more_like_this_options', {}))
        doc_id = pk if pk else id
        # obj = self.get_object()
        queryset = self.filter_queryset(self.get_queryset())

        fields = options.pop('fields', {})
        if not fields:
            # Fall back to the fields declared on the serializer.
            fields = self.get_serializer_class().Meta.fields[:]

        like_document = {
            '_id': "{}".format(doc_id),
            '_index': "{}".format(self.index),
            '_type': "{}".format(self.mapping),
        }
        queryset = queryset.query(
            MoreLikeThis(fields=fields, like=like_document, **options)
        ).sort('_score')

        return Response(queryset.execute().to_dict())
def find_related_documents(*, user, post_id):
    """
    Execute a "more like this" query to find posts that are related to a specific post

    Args:
        user (User): The user executing the search
        post_id (str): The id of the post that you want to find related posts for

    Returns:
        dict: The Elasticsearch response dict
    """
    search = Search(index=get_default_alias_name(ALIAS_ALL_INDICES))
    search = _apply_general_query_filters(search, user)
    mlt_query = MoreLikeThis(
        like={"_id": gen_post_id(post_id), "_type": GLOBAL_DOC_TYPE},
        fields=RELATED_POST_RELEVANT_FIELDS,
        min_term_freq=1,
        min_doc_freq=1,
    )
    search = search.query(mlt_query)
    # Limit results to the number indicated in settings
    search = search[0:settings.OPEN_DISCUSSIONS_RELATED_POST_COUNT]
    return search.execute().to_dict()
def search_more_like_this(talk):
    """Get more like this documents"""
    es_client = Elasticsearch([{
        'host': settings.ELASTICSEARCH['default']['HOSTNAME'],
        'port': settings.ELASTICSEARCH['default']['PORT'],
    }])

    mlt_query = MoreLikeThis(
        like={"_index": "vtalks", "_type": "talk", "_id": talk.id},
        fields=['title', 'description', 'tags'])
    search = Search(using=es_client, index="vtalks").query(mlt_query)
    # Sort by relevance score, descending.
    search = search.sort({"_score": {"order": "desc"}})
    # Only fetch the id field of each hit.
    search = search.source(['id'])

    response = search.execute()
    matching_ids = [hit.id for hit in response.hits]
    return response.hits.total, matching_ids
def find_similar_resources(*, user, value_doc):
    """
    Execute a "more like this" query to find learning resources that are similar to the one provided.

    Args:
        user (User): The user executing the search
        value_doc (dict): a document representing the data fields we want to search with

    Returns:
        dict: The Elasticsearch response dict
    """
    search = Search(index=get_default_alias_name(ALIAS_ALL_INDICES))
    search = _apply_general_query_filters(search, user)
    search = search.filter(Q("terms", object_type=LEARNING_RESOURCE_TYPES))
    search = search.query(
        MoreLikeThis(
            like={"doc": value_doc, "fields": list(value_doc.keys())},
            fields=SIMILAR_RESOURCE_RELEVANT_FIELDS,
            min_term_freq=settings.OPEN_RESOURCES_MIN_TERM_FREQ,
            min_doc_freq=settings.OPEN_RESOURCES_MIN_DOC_FREQ,
        ))
    response = search.execute()

    # Drop the reference document itself, then cap the result count.
    similar = []
    for hit in response.hits:
        if (hit["id"] != value_doc.get("id", None)
                or hit["object_type"] != value_doc.get("object_type", None)):
            similar.append(hit.to_dict())
    return similar[0:settings.OPEN_DISCUSSIONS_SIMILAR_RESOURCES_COUNT]
def get_more_like_this(s, query_text):
    """Run a More-Like-This query on title/abstract/body (body boosted x3)."""
    mlt_query = MoreLikeThis(
        like=query_text,
        fields=['title', 'abstract', 'body^3'],
        stop_words=get_stop_words())
    s = s.query(mlt_query)
    # get first top 10 similar articles (slice starts at 1)
    response = s[1:11].execute()
    return _extract_response(response)
def get_related_items(self, obj):
    """Get related items.

    Runs a More-Like-This query on the English title/description fields
    and returns compact dict representations of similar collection items.

    :param obj: Item to find related items for.
    :return: list of collections.OrderedDict
    """
    max_query_terms = 25
    # BUGFIX: was ``min_term_freq = 1,`` — the trailing comma made this an
    # accidental 1-tuple ``(1,)``. Harmless only because its kwargs entry
    # is commented out below, but fixed to avoid a latent bug.
    min_term_freq = 1
    min_doc_freq = 1
    max_doc_freq = 25

    kwargs = {}
    if max_query_terms is not None:
        kwargs['max_query_terms'] = max_query_terms
    # if min_term_freq is not None:
    #     kwargs['min_term_freq'] = min_term_freq
    if min_doc_freq is not None:
        kwargs['min_doc_freq'] = min_doc_freq
    if max_doc_freq is not None:
        kwargs['max_doc_freq'] = max_doc_freq

    query = CollectionItemDocument().search()
    search = query.query(
        MoreLikeThis(
            fields=(
                'title_en.natural',
                'description_en.natural',
            ),
            like={
                '_id': "{}".format(obj.id),
                '_index': "{}".format(INDEX._name),
                '_type': "{}".format(list(INDEX._mappings.keys())[0])
            },
            **kwargs
        )
    )

    related_items = []
    for __o in search:
        _id = int(__o.meta.id)
        related_items.append(
            OrderedDict([
                ('id', _id),
                ('images_urls', __o.images_urls._l_),
                # English
                ('title_en', __o.title_en._l_),
                ('description_en', __o.description_en._l_),
                # Dutch
                ('title_nl', __o.title_nl._l_),
                ('description_nl', __o.description_nl._l_),
            ])
        )
    return related_items
def more_like_this(doc):
    """Search for letters similar to the serialized form of ``doc``."""
    mlt_query = MoreLikeThis(
        like=serialize_document(doc),
        fields=["title", "body", "content"],
        min_term_freq=1,
        min_doc_freq=1)
    search = LetterDocument.search().query(mlt_query)
    # print(search.to_dict())
    return search.execute()
def build_query_body(item_uri, media_type=None, max_duration=None, published_after=None,
                     region=None, similarity_method=None, limit=constants.DEFAULT_QUERY_LIMIT,
                     offset=constants.DEFAULT_QUERY_OFFSET):
    """Build query dict ready to pass to Elasticsearch search instance for
    retrieving a list of similar items given a URI."""
    # Guard clauses for parameters this backend does not support yet.
    if published_after is not None:
        raise NotImplementedError('The parameter `publishedAfter` is not yet implemented.')
    if region is not None:
        raise NotImplementedError('The parameter `region` is not yet implemented.')
    if similarity_method is not None:
        raise NotImplementedError('The parameter `similarityMethod` is not yet implemented for ES.')

    search = Search(index='pips')
    search = search[offset:offset + limit]

    # TODO: THIS DOESNT WORK?? query builds as should but no effect
    if media_type:
        for media in media_type:
            search = search.filter('term', mediaType=media)
    if max_duration:
        search = search.filter('range', duration={'lte': max_duration})

    reference = {'_index': 'pips', '_type': 'clip', '_id': item_uri}
    similarity_clauses = [
        # by title
        MoreLikeThis(like=reference,
                     fields=['title', 'masterBrand.mid', 'mediaType'],
                     min_term_freq=1,
                     min_doc_freq=1),
        # by genre key (nested field)
        Q('nested',
          path='genres',
          query=MoreLikeThis(fields=['genres.key'],
                             like=reference,
                             min_term_freq=1,
                             min_doc_freq=1)),
    ]
    search = search.query('bool', should=similarity_clauses)
    return search.to_dict()
def create_mlt_with_id(document_id, position, index):
    """Build a nested More-Like-This search for ``document_id``.

    :param document_id: id of the reference document.
    :param position: unused; kept for interface compatibility.
    :param index: name of the index to search.
    :return: elasticsearch_dsl ``Search`` object (not executed).
    """
    s = Search(using=client, index=index)
    # BUGFIX: the result of ``source()`` was discarded — Search objects are
    # immutable, so the call must be reassigned to take effect.
    s = s.source(includes=['*'], excludes=["body"])
    # BUGFIX: ``like`` previously referenced the builtin ``id`` function
    # instead of the ``document_id`` parameter.
    mlt_match = MoreLikeThis(fields=["body.content"],
                             like=[document_id],
                             min_term_freq=1,
                             min_doc_freq=1)
    nested_query = Nested(path='body', inner_hits={}, query=mlt_match)
    s = s.query(nested_query)
    return s
def create_mlt_with_id(document_id, index, size=20):
    """Build a More-Like-This search for a single document id or a list of ids."""
    search = Search(using=client, index=index)
    if isinstance(document_id, list):
        like_refs = [{'_index': index, '_id': item} for item in document_id]
        mlt_query = MoreLikeThis(fields=["content"],
                                 like=like_refs,
                                 min_term_freq=1,
                                 min_doc_freq=1,
                                 analyzer='stop')
    else:
        mlt_query = MoreLikeThis(fields=["content"],
                                 like={'_index': index, '_id': document_id},
                                 min_term_freq=1,
                                 min_doc_freq=1,
                                 minimum_should_match='5%',
                                 analyzer='stop')
    search = search.query(mlt_query)
    search = search[:size]
    return search
def recommend_mlt(request):
    """Recommend two more-like-this jokes for inclusion on another page.

    Raises HTTPNotFound when Elasticsearch is unreachable or fewer than
    two similar jokes exist.
    """
    try:
        search = Search(index='toja_jokes')
        search = search.query(MoreLikeThis(like={'_id': request.matchdict['jid']}))
        search = search[0:2]
        results = search.execute()
        joke_ids = [joke.meta.id for joke in results]
        # Simplified: ``joke_ids and len(joke_ids) >= 2`` was redundant —
        # a list with two or more elements is always truthy.
        if len(joke_ids) >= 2:
            jokes = request.dbsession.query(Image).filter(Image.id.in_(joke_ids))
            return {'jokes': jokes}
    except ConnectionError:
        # Treat an unreachable search backend the same as "no results".
        pass
    raise HTTPNotFound()
def more_like_this(self, item, max_hits=3):
    """Return up to ``max_hits`` documents similar to ``item``,
    keeping only future events or documents with no date at all."""
    mlt_query = MoreLikeThis(
        like=[
            # {'_id': ElasticIndex._get_id(item), '_index': self.index_name},
            item.indexable_content(),
            item.category_names(),
        ],
        min_term_freq=1,
        min_doc_freq=2,
        max_query_terms=12,
        fields=['title', 'content', 'description', 'location', 'category',
                'organization_name', 'website'])

    search = (Search(index=self.index_name)
              .doc_type(StarDocument)
              .query(mlt_query))
    search = search[0:max_hits]

    # Filter out past events: keep future events OR documents whose
    # date field is empty.
    future_events = {"range": {"date": {"gte": datetime.datetime.utcnow()}}}
    undated = {"bool": {"must_not": {"exists": {"field": "date"}}}}
    search = search.filter('bool', should=[future_events, undated])

    return search.execute()
def get(self, request, format=None):
    """Recommend restaurants similar to the requested one."""
    res = {'status': False}
    restaurant_id = request.GET.get('restaurant', '')
    like_doc = {'_index': 'restaurant', '_type': '_doc', '_id': restaurant_id}
    search = (
        RestaurantDocument.search()
        .query(MoreLikeThis(like=like_doc,
                            fields=['descriptions'],
                            min_term_freq=1,
                            max_query_terms=5))
        .source(excludes=['descriptions', 'created_time'])
    )
    pages = [{'title': hit.restaurant} for hit in search.execute()]
    res.update({'status': True, 'data': pages})
    return JsonResponse(res)
def get_similar_topics(value_doc, num_topics, min_term_freq, min_doc_freq):
    """
    Get a list of similar topics based on text values

    Args:
        value_doc (dict): a document representing the data fields we want to search with
        num_topics (int): number of topics to return
        min_term_freq (int): minimum times a term needs to show up in input
        min_doc_freq (int): minimum times a term needs to show up in docs

    Returns:
        list of str: list of topic values
    """
    search = Search(index=get_default_alias_name(ALIAS_ALL_INDICES))
    search = search.filter(Q("terms", object_type=[COURSE_TYPE]))
    mlt_query = MoreLikeThis(
        like=[{"doc": value_doc, "fields": list(value_doc.keys())}],
        fields=["course_id", "title", "short_description", "full_description"],
        min_term_freq=min_term_freq,
        min_doc_freq=min_doc_freq,
    )
    search = search.query(mlt_query).source(includes="topics")

    response = search.execute()
    # Count topic occurrences across every hit, keep the most common.
    counter = Counter(topic for hit in response.hits for topic in hit.topics)
    return list(dict(counter.most_common(num_topics)).keys())
def more_like_this(self, doc_id):
    """This method takes in a doc ID and queries the elasticsearch index for
    courses with similar title or description"""
    reference = [{"_index": self.index, "_id": doc_id}]
    # We're going to match based only on these fields
    relevant_fields = [
        "Course.CourseShortDescription",
        "Course.CourseTitle",
        "Course.CourseProvider",
    ]
    self.search = self.search.query(
        MoreLikeThis(like=reference, fields=relevant_fields))
    self.user_organization_filtering()
    # only fetch the first 6 results
    # TODO: make the size configurable
    self.search = self.search[0:6]
    response = self.search.execute()
    logger.info(response)
    return response
def search(request):
    """Render search results: a multi-match page plus more-like-this hits."""
    query = request.GET.get('q')
    # q = MultiMatch(query=query, fields=['title', 'body'])
    data = {
        'query': True,
        'more': NoteDocument.search().query(
            MoreLikeThis(like=query, fields=['title', 'body'])),
        'page': NoteDocument.search().query('multi_match', query=query,
                                            fields=['title', 'body']),
        # .query("match", body=query)
    }
    # print(data['page'])
    # Debug output: iterating the Search executes the MLT query.
    for hit in data['more']:
        print(hit.body)
        print(hit.title)
    return render(request, 'search/search.html', context=data)
def more_like_this(obj, fields, max_query_terms=25, min_term_freq=2,
                   min_doc_freq=5, max_doc_freq=0, query=None):
    """Find documents similar to the Django model instance ``obj``.

    Returns ``None`` when no index/mapping is registered for the model,
    otherwise an unexecuted ``Search`` carrying the MLT query.
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)
    if _index is None:
        return None

    search = Search(using=connections.get_connection(), index=_index)
    if query is not None:
        search = search.query(query)

    # Collect only the tuning options that were explicitly provided.
    options = {}
    for key, value in (('max_query_terms', max_query_terms),
                       ('min_term_freq', min_term_freq),
                       ('min_doc_freq', min_doc_freq),
                       ('max_doc_freq', max_doc_freq)):
        if value is not None:
            options[key] = value

    return search.query(
        MoreLikeThis(fields=fields,
                     like={'_id': "{}".format(obj.pk),
                           '_index': "{}".format(_index),
                           '_type': "{}".format(_mapping)},
                     **options))
def more_like_this(elastic_url, fields: list, like: list, size: int, filters: list, aggregations: list, include: bool, if_agg_only: bool, dataset: Dataset, return_fields=None):
    """Run a More-Like-This query with optional filters and aggregations.

    :param elastic_url: URL of the Elasticsearch host.
    :param fields: fields the MLT query matches against.
    :param like: list of document references; element 0 must carry an
        ``_index`` key (read below for keyword detection).
    :param size: maximum number of documents to return.
    :param filters: user-set filter dicts (may be empty).
    :param aggregations: user-set aggregation dicts (may be empty).
    :param include: whether the MLT query may return the input docs themselves.
    :param if_agg_only: when True, return only aggregation results (size 0).
    :param dataset: Dataset object supplying the index and mapping to query.
    :param return_fields: optional list of source fields to include in hits.
    :return: dict of hits (and aggregations) on success, or an
        ``{"elasticsearch": [...]}`` error payload on failure.
    """
    # Create the base query creator and unite with ES gateway.
    search = Search(using=Elasticsearch(elastic_url)).index(dataset.index).doc_type(dataset.mapping)
    mlt = MoreLikeThis(like=like, fields=fields, min_term_freq=1, max_query_terms=12, include=include)  # Prepare the MLT part of the query.

    paginated_search = search[0:size]  # Set how many documents to return.
    limited_search = paginated_search.source(return_fields) if return_fields else paginated_search  # If added, choose which FIELDS to return.
    finished_search = limited_search.query(mlt)  # Add the premade MLT into the query.

    # Apply all the user-set filters, if they didn't add any this value will be [] and it quits.
    for filter_dict in filters:
        finished_search = finished_search.filter(Q(filter_dict))

    # Apply all the user-set aggregations, if they didn't add any this value will be [] and it quits.
    for aggregation_dict in aggregations:
        # aggs.bucket() does not return a Search object but changes it instead.
        if aggregation_dict["agg_type"] == "composite":
            after = aggregation_dict.get("after_key", None)
            finished_search = ES_Manager.handle_composition_aggregation(finished_search.to_dict(), aggregation_dict, after)
        else:
            field_name = aggregation_dict["field"]
            index = like[0]["_index"]
            # Text fields must be aggregated on their ``.keyword`` sub-field.
            field = "{}.keyword".format(field_name) if ES_Manager.is_field_text_field(field_name=field_name, index_name=index) else field_name
            finished_search.aggs.bucket(name=aggregation_dict["bucket_name"], agg_type=aggregation_dict["agg_type"], field=field)

    # Choose if you want to return only the aggregations in {"bucket_name": {results...}} format.
    if if_agg_only:
        finished_search = finished_search.params(size=0)
        response = finished_search.execute()
        return response.aggs.to_dict()

    try:
        response = finished_search.execute()
        result = {"hits": [hit.to_dict() for hit in response]}  # Throw out all metadata and keep only the documents.
        if response.aggs:
            result.update({"aggregations": response.aggs.to_dict()})  # IF the aggregation query returned anything, THEN add the "aggregatons" key with results.
        return result
    except ElasticsearchException as e:
        logging.getLogger(ERROR_LOGGER).exception(e)
        return {"elasticsearch": [str(e)]}
def get_similar_clips(clip_id):
    """ Return 10 clips deemed to be most similar.

    Raises ClipNotFoundException when ``clip_id`` is not in the index.
    """
    clip = get_clip_by_id(clip_id)
    if clip is None:
        raise ClipNotFoundException(["clip_id searched for not in index."])

    # Stop words taken from nltk's list of stop words.
    # BUGFIX: ``"should" "during"`` was missing a comma — implicit string
    # concatenation produced the single bogus token "shouldduring", so
    # neither "should" nor "during" was actually treated as a stop word.
    stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves",
                  "you", "your", "yours", "yourself", "yourselves", "he",
                  "him", "his", "himself", "she", "her", "hers", "herself",
                  "it", "its", "itself", "they", "them", "their", "theirs",
                  "themselves", "what", "which", "who", "whom", "this",
                  "that", "these", "those", "am", "is", "are", "was", "were",
                  "be", "been", "being", "have", "has", "had", "having",
                  "do", "does", "did", "doing", "a", "an", "the", "and",
                  "but", "if", "or", "because", "as", "until", "while",
                  "of", "at", "by", "for", "with", "about", "into",
                  "through", "now", "should", "during", "before", "after",
                  "above", "below", "to", "from", "up", "down", "in", "out",
                  "on", "off", "over", "under", "again", "further", "then",
                  "once", "here", "there", "when", "where", "why", "how",
                  "all", "any", "both", "each", "few", "more", "most",
                  "other", "some", "such", "no", "nor", "not", "only",
                  "own", "same", "so", "than", "too", "very", "s", "t",
                  "can", "will", "just", "don"]

    dsl_search = Clip.search()
    similar_clips = dsl_search.query(
        MoreLikeThis(like={'_id': clip.meta.id},
                     fields=['title', 'caption', 'categories'],
                     min_term_freq=1,
                     stop_words=stop_words,
                     min_doc_freq=5,
                     minimum_should_match=0)).execute()
    return similar_clips
from elasticsearch import Elasticsearch
from elasticsearch.exceptions import RequestError
from elasticsearch_dsl import Search, Index, Document, connections, \
    Keyword, Date, Text, Integer, MetaField, Nested, InnerDoc
# NOTE(review): ``Nested`` is imported three times; the query-DSL ``Nested``
# imported last shadows the field class imported above — confirm which one
# is intended and drop the redundant imports.
from elasticsearch_dsl.query import Nested
from elasticsearch_dsl.query import Match, Nested, Term, MoreLikeThis
from tabulate import tabulate

# Default local Elasticsearch client (localhost:9200).
client = Elasticsearch()

# Example nested More-Like-This query over the ``body.content`` field.
mlt_match = MoreLikeThis(fields=["body.content"], like=["you owe me"], min_term_freq=1, min_doc_freq=1)
innerMatch = Match(body__content='stock')
nestedMatch = Nested(path='body', query=innerMatch)

# retrieve all documents containing stock in its body)
s = Search(using=client, index='enron') \
    .query("match", body="stock")
"""
in order to change the size of return:
s= s[0:0]
will create a size 0 request.
It's all done with python slicing.
"""
# Print the total hit count and each hit's URL from the earlier raw query
# result ``res`` (defined above this fragment).
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print(hit['_source']['url'])
print('\n')

print('Lets try to find more like: ' + str(res['hits']['hits'][0]['_source']['url']))
# We compare the contents of the html field now to find something similar
# A bit hacky.. we now use the DSL version of elasticsearch. This one's
# supposed to be a bit easier to use, so I wanted to see if it's easier
# to define a query with this syntax (compared to above)
from elasticsearch_dsl.query import MoreLikeThis
from elasticsearch_dsl import Search

#html
my_text = str(res['hits']['hits'][0]['_source']['html'])
# NOTE(review): the html-based ``my_text`` above is immediately overwritten
# by the url-based text below — the html variant is dead code.
my_text = str(res['hits']['hits'][0]['_source']['url'].replace(
    '/', ' ').replace('-', ' '))  #url

s = Search(using=es)
s = s.query(MoreLikeThis(like=my_text, fields=['url', 'html', 'title']))
# You can also exclude fields from the result to make the response quicker in the normal way
# s = s.source(exclude=["sentences", "text"])

response = s.execute().to_dict()
print('There are ' + str(response['hits']['total']['value']) + ' results:\n')
# Print only the first five similar URLs.
for i in range(0, 5):
    print(response['hits']['hits'][i]['_source']['url'])
#The recommendation probably works terrible because it now also compares html tags
def more_like_this(obj, fields, max_query_terms=25, min_term_freq=2,
                   min_doc_freq=5, max_doc_freq=0, query=None):
    """More like this.

    https://www.elastic.co/guide/en/elasticsearch/reference/current/
    query-dsl-mlt-query.html

    :param obj: Django model instance for which similar objects shall be
        found.
    :param fields: Fields to search in.
    :param max_query_terms:
    :param min_term_freq:
    :param min_doc_freq:
    :param max_doc_freq:
    :param query: Q query
    :type obj: Instance of `django.db.models.Model` (sub-classed) model.
    :type fields: list
    :type max_query_terms: int
    :type min_term_freq: int
    :type min_doc_freq: int
    :type max_doc_freq: int
    :type query: elasticsearch_dsl.query.Q
    :return: List of objects.
    :rtype: elasticsearch_dsl.search.Search

    Example:

        >>> from django_elasticsearch_dsl_drf.helpers import more_like_this
        >>> from books.models import Book
        >>> book = Book.objects.first()
        >>> similar_books = more_like_this(
        >>>     book,
        >>>     ['title', 'description', 'summary']
        >>> )
    """
    _index, _mapping = get_index_and_mapping_for_model(obj._meta.model)
    if _index is None:
        return None

    search = Search(using=connections.get_connection(), index=_index)
    if query is not None:
        search = search.query(query)

    # Collect only the tuning options that were explicitly provided.
    options = {}
    for key, value in (('max_query_terms', max_query_terms),
                       ('min_term_freq', min_term_freq),
                       ('min_doc_freq', min_doc_freq),
                       ('max_doc_freq', max_doc_freq)):
        if value is not None:
            options[key] = value

    like_options = {'_id': "{}".format(obj.pk),
                    '_index': "{}".format(_index)}
    # Elasticsearch 7+ dropped mapping types, so ``_type`` is only sent
    # for older clusters.
    if not ELASTICSEARCH_GTE_7_0:
        like_options.update({'_type': "{}".format(_mapping)})

    return search.query(
        MoreLikeThis(fields=fields, like=like_options, **options))
def ejercicio2():
    """Find reddit-mentalhealth documents similar to a user-supplied query.

    Runs a More-Like-This query, extracts significant terms per field,
    filters out stop words, then fetches documents matching any remaining
    term and serializes them to ``Ejercicio2.json``.
    """
    es = config()
    # BUGFIX: ``raw_input`` exists only in Python 2, while the rest of this
    # function uses Python 3 style ``print(...)``; use ``input`` instead.
    query = input("Introduzca un termino/frase a buscar >> ")
    print()
    util.install_and_import("elasticsearch-dsl", "elasticsearch_dsl")
    from elasticsearch_dsl import Search
    from elasticsearch_dsl.query import MoreLikeThis

    properties = select_estadistico()
    est = properties[0]
    properties_est = properties[1]
    number = 25

    s = Search(using=es, index="reddit-mentalhealth")
    s = s.query(
        MoreLikeThis(
            like=query,
            fields=['selftext', 'title', 'subreddit'],
            min_term_freq=1,
            max_query_terms=number,
        ))

    # One significant-terms aggregation per searchable field.
    agg = {
        "significant_terms": {
            "field": "selftext",
            "size": number,
            est: properties_est
        }
    }
    s.aggs.bucket('Text', agg)
    agg = {
        "significant_terms": {
            "field": "subreddit",
            "size": number,
            est: properties_est
        }
    }
    s.aggs.bucket('Subreddit', agg)
    agg = {
        "significant_terms": {
            "field": "title",
            "size": number,
            est: properties_est
        }
    }
    s.aggs.bucket('Title', agg)

    results = s.execute().to_dict()

    # Load stop words ("palabras vacias") to filter them out.
    stops_words = []
    with open("stop.txt") as f:
        for line in f:
            stops_words.append(line.split(" ", 1)[0])
    stops_words = list(filter(lambda x: x != "\n" and x != "", stops_words))

    # Collect unique, non-stop-word significant terms from every bucket.
    words = []
    for j in ["Subreddit", "Text", "Title"]:
        for i in results["aggregations"][j]["buckets"]:
            if (i["key"] not in stops_words and i["key"] not in words):
                words.append(i["key"])

    results = es.search(
        index="reddit-mentalhealth",
        body={"query": {
            "query_string": {
                "query": ' OR '.join(words),
            }
        }})

    json_data = []
    for element in results['hits']['hits']:
        data = {}
        element = element['_source']
        data['selftext'] = element['selftext']
        data['title'] = element['title']
        data['subreddit'] = element['subreddit']
        json_data.append(data)

    if len(json_data) != 0:
        util.serializer(json_data, 'Ejercicio2.json')
    else:
        print("No hay datos para guardar.\n")