Ejemplo n.º 1
0
def div_translation_count(lang):
    """Return translation counts keyed by division and subdivision uid.

    Runs two terms aggregations (on ``division`` and ``subdivision``) over
    the ``text`` documents of the index named ``lang``.

    Returns:
        A dict mapping each aggregation bucket key to its document count, or
        None when the search raises ``NotFoundError``. When a key appears as
        both a division and a subdivision, the division count is kept.
    """
    aggregation_fields = (("div_uids", "division"), ("subdiv_uids", "subdivision"))
    request = {
        "aggregations": {
            # size 0 asks for an unlimited number of buckets.
            agg_name: {"terms": {"field": field_name, "size": 0}}
            for agg_name, field_name in aggregation_fields
        }
    }
    try:
        response = es.search(index=lang, doc_type="text", search_type="count", body=request)
    except elasticsearch.exceptions.NotFoundError:
        return None

    counts = {}
    # Subdivisions are written first so that a division sharing the same key
    # overwrites the subdivision count.
    for agg_name in ("subdiv_uids", "div_uids"):
        for bucket in response["aggregations"][agg_name]["buckets"]:
            counts[bucket["key"]] = bucket["doc_count"]
    return counts
Ejemplo n.º 2
0
def sutta_search(**kwargs):
    """Search the "suttas" index by name, volpage, acronym, division and lang.

    Keyword Args:
        mode: Elasticsearch query type used for every field except ``lang``
            (which always uses ``term``). Defaults to ``"wildcard"``.
        name, volpage, acronym, division, lang: Values to match; each is
            lower-cased before being sent. Missing or falsy values are
            skipped. ``volpage`` matches either ``volpage`` or
            ``volpage_extra``; ``division`` matches either ``division`` or
            ``subdivision``; ``acronym`` is matched against ``uid`` after
            spaces are removed.
        limit: Maximum number of hits (default 25).
        offset: Index of the first hit (default 0).

    Returns:
        The response of ``es.search``, or None when no searchable field was
        supplied.
    """
    mode = kwargs.get("mode") or "wildcard"
    fields = {
        "name": {"mode": mode},
        "volpage": {"mode": mode, "fields": ["volpage", "volpage_extra"]},
        "acronym": {"mode": mode, "field": "uid"},
        "division": {"mode": mode, "fields": ["division", "subdivision"]},
        "lang": {"mode": "term"},
    }

    # Normalise only a real value: an explicit ``acronym=None`` is treated
    # like any other missing field instead of raising AttributeError.
    acronym = kwargs.get("acronym")
    if acronym:
        kwargs["acronym"] = acronym.lower().replace(' ', '')

    queries = []
    for field, params in fields.items():
        value = kwargs.get(field)
        if not value:
            continue
        needle = value.lower()
        if "fields" in params:
            # Several candidate fields: a hit in any one of them is enough.
            queries.append({
                "bool": {
                    "should": [
                        {params["mode"]: {sub_field: {"value": needle}}}
                        for sub_field in params["fields"]
                    ]
                }
            })
        else:
            target = params.get("field", field)
            queries.append({params["mode"]: {target: {"value": needle}}})

    if not queries:
        return None

    body = {
        "size": int(kwargs.get("limit", 25)),
        "from": int(kwargs.get("offset", 0)),
        # Every supplied field has to match.
        "query": {"bool": {"must": queries}},
        "sort": [
            {"_score": {"order": "desc"}},
            {"ordering": {"order": "asc"}},
        ],
    }

    return es.search(index="suttas", body=body)
Ejemplo n.º 3
0
def search(uid):
    """Find forum topics related to ``uid`` in the ``discourse_index`` index.

    A topic matches when one of its ``post`` children mentions the uid in
    ``plain``, or when the uid matches the topic's ``tags`` or ``title``.
    The text score is combined (``boost_mode`` "sum") with a gaussian decay
    on ``updated_at``.

    Args:
        uid: Identifier to look for. It is lower-cased and only the part
            before the first hyphen is searched.

    Returns:
        None when ``discourse_is_available`` is falsy; otherwise a dict with

        * ``topics``: one entry per hit holding ``topic_id``,
          ``post_number``, ``title``, ``category_id`` and ``snippet``;
        * ``categories``: category id -> value returned by ``get_category``
          for every category referenced by a topic, plus its parent category
          when it has one.
    """
    if not discourse_is_available:
        return None
    uid = uid.lower().split('-')[0]
    body = {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "should": [
                            {
                                "has_child": {
                                    "type": "post",
                                    "score_mode": "sum",
                                    "query": {
                                        "match": {
                                            "plain": uid
                                        }
                                    },
                                    # Return the single best matching post
                                    # together with highlighted fragments.
                                    "inner_hits": {
                                        "size": 1,
                                        "highlight": {
                                            "fields": {
                                                "plain": {}
                                            }
                                        }
                                    }
                                }
                            },
                            {
                                "match": {
                                    "tags": uid
                                }
                            },
                            {
                                "match": {
                                    "title": uid
                                }
                            }
                        ]
                    }
                },
                "boost_mode": "sum",
                "functions": [{
                    "gauss": {
                        "updated_at": {
                            "origin": "now",
                            "scale": "21d",
                            "offset": "7d",
                            "decay": 0.5
                        }
                    },
                    "weight": 2
                }]
            }
        }
    }
    result = es.search(discourse_index, doc_type='topic', body=body)
    out = {'topics': [], 'categories': {}}

    for hit in result['hits']['hits']:
        source = hit['_source']
        inner_hits = hit['inner_hits']['post']['hits']['hits']
        if inner_hits:
            # A post matched: use its highlighted fragments as the snippet.
            inner_hit = inner_hits[0]['_source']
            snippet = ' … '.join(inner_hits[0]['highlight']['plain'])
        else:
            # The topic matched on tags/title only; fall back to the post
            # with the lowest post_number in that topic.
            first_post_query = {
                "size": 1,
                "query": {
                    "term": {
                        "topic_id": source['id']
                    }
                },
                "sort": [{
                    "post_number": {
                        "order": "asc"
                    }
                }]
            }
            r = es.search(discourse_index,
                          doc_type='post',
                          body=first_post_query)
            inner_hits = r['hits']['hits']
            if inner_hits:
                inner_hit = inner_hits[0]['_source']
                snippet = make_snippet(inner_hit['plain'])
            else:
                inner_hit = None
                snippet = ''

        out['topics'].append({
            'topic_id': source['id'],
            'post_number': inner_hit['post_number'] if inner_hit else None,
            'title': source['title'],
            'category_id': source['category_id'],
            'snippet': snippet,
        })

        # Collect each referenced category (and its parent) only once.
        if source['category_id'] not in out['categories']:
            cat = get_category(source['category_id'])
            out['categories'][cat['id']] = cat
            parent_id = cat['parent_category_id']
            if parent_id and parent_id not in out['categories']:
                parent_cat = get_category(parent_id)
                out['categories'][parent_cat['id']] = parent_cat
    return out
Ejemplo n.º 4
0
def search(uid):
    """Find forum topics related to ``uid`` in the ``discourse_index`` index.

    A topic matches when one of its ``post`` children mentions the uid in
    ``plain``, or when the uid matches the topic's ``tags`` or ``title``.
    The text score is combined (``boost_mode`` "sum") with a gaussian decay
    on ``updated_at``.

    Args:
        uid: Identifier to look for; it is lower-cased before searching.

    Returns:
        None when ``discourse_is_available`` is falsy; otherwise a dict with

        * ``topics``: one entry per hit holding ``topic_id``,
          ``post_number``, ``title``, ``category_id`` and ``snippet``;
        * ``categories``: category id -> value returned by ``get_category``
          for every category referenced by a topic, plus its parent category
          when it has one.
    """
    if not discourse_is_available:
        return None
    uid = uid.lower()
    body = {
        "query": {
            "function_score": {
                "query": {
                    "bool": {
                        "should": [
                            {
                                "has_child": {
                                    "type": "post",
                                    "score_mode": "sum",
                                    "query": {
                                        "match": {
                                            "plain": uid
                                        }
                                    },
                                    # Return the single best matching post
                                    # together with highlighted fragments.
                                    "inner_hits": {
                                        "size": 1,
                                        "_source": ["post_number", "id"],
                                        "highlight": {
                                            "fields": {
                                                "plain": {}
                                            }
                                        }
                                    }
                                }
                            },
                            {
                                "match": {
                                    "tags": uid
                                }
                            },
                            {
                                "match": {
                                    "title": uid
                                }
                            }
                        ]
                    }
                },
                "boost_mode": "sum",
                "functions": [
                    {
                        "gauss": {
                            "updated_at": {
                                "origin": "now",
                                "scale": "21d",
                                "offset": "7d",
                                "decay": 0.5
                            }
                        },
                        "weight": 2
                    }
                ]
            }
        }
    }
    result = es.search(discourse_index, doc_type='topic', body=body)
    out = {
        'topics': [],
        'categories': {}
    }

    for hit in result['hits']['hits']:
        source = hit['_source']
        inner_hits = hit['inner_hits']['post']['hits']['hits']
        if inner_hits:
            # A post matched: use its highlighted fragments as the snippet.
            inner_hit = inner_hits[0]['_source']
            snippet = ' … '.join(inner_hits[0]['highlight']['plain'])
        else:
            # The topic matched on tags/title only; fall back to the post
            # with the lowest post_number in that topic. A plain ``term``
            # query is used instead of the deprecated ``filtered`` wrapper;
            # because the hits are sorted explicitly, the same post is
            # returned.
            first_post_query = {
                "size": 1,
                "query": {
                    "term": {
                        "topic_id": source['id']
                    }
                },
                "sort": [
                    {
                        "post_number": {
                            "order": "asc"
                        }
                    }
                ]
            }

            r = es.search(discourse_index,
                          doc_type='post',
                          body=first_post_query)
            inner_hits = r['hits']['hits']
            if inner_hits:
                inner_hit = inner_hits[0]['_source']
                snippet = make_snippet(inner_hit['plain'])
            else:
                inner_hit = None
                snippet = ''

        out['topics'].append({
            'topic_id': source['id'],
            'post_number': inner_hit['post_number'] if inner_hit else None,
            'title': source['title'],
            'category_id': source['category_id'],
            'snippet': snippet
        })

        # Collect each referenced category (and its parent) only once.
        if source['category_id'] not in out['categories']:
            cat = get_category(source['category_id'])
            out['categories'][cat['id']] = cat
            parent_id = cat['parent_category_id']
            if parent_id and parent_id not in out['categories']:
                parent_cat = get_category(parent_id)
                out['categories'][parent_cat['id']] = parent_cat
    return out
Ejemplo n.º 5
0
def search(query, highlight=True, offset=0, limit=10, lang=None, define=None, details=None, **kwargs):
    """Run a scored full-text search over one or more indexes.

    Args:
        query: Search string; surrounding whitespace is removed. If it
            contains any of ``: " ~ *`` or the substrings ``AND``, ``OR`` or
            ``NOT`` it is sent as a ``query_string`` query (with ``define:``
            rewritten to ``term:``), otherwise as a ``multi_match`` query.
        highlight: When true, ask for highlighted fragments of ``content``.
        offset: Index of the first hit to return.
        limit: Maximum number of hits to return.
        lang: Optional index name to add to the search.
        define: When not None, the "en-dict" index is searched.
        details: When not None, the "suttas" index is searched.
        **kwargs: Accepted but not used by this function.

    Returns:
        The response of ``es.search``.
    """
    # str.strip() returns a new string, so the result has to be rebound.
    query = query.strip()

    indexes = []
    if details is not None:
        indexes = ["suttas"]
    if define is not None:
        indexes.append("en-dict")
    if lang:
        indexes.append(lang)

    # Nothing requested explicitly: search the default set of indexes.
    if not indexes:
        indexes = ["en", "pi", "suttas", "en-dict"]

    index_string = ",".join(get_available_indexes(indexes))

    fields = [
        "content",
        "content.*^0.5",
        "term^1.5",
        "term.*^0.5",
        "gloss^1.5",
        "lang^0.5",
        "author^0.5",
        "uid",
        "uid.division^0.7",
        "name^1.25",
        "name.*^0.75",
        "heading.title^0.5",
        "heading.title.plain^0.5",
        "heading.title.shingle^0.5",
    ]

    if regex.search(r'[:"~*]', query) or regex.search(r"AND|OR|NOT", query):
        query = query.replace("define:", "term:")
        inner_query = {"query_string": {"fields": fields, "query": query, "use_dis_max": True}}
    else:
        inner_query = {"multi_match": {"type": "best_fields", "tie_breaker": 0.3, "fields": fields, "query": query}}

    body = {
        "from": offset,
        "size": limit,
        "_source": ["uid", "lang", "name", "volpage", "gloss", "term", "heading", "is_root"],
        "timeout": "15s",
        "query": {
            "function_score": {
                "query": inner_query,
                # The individual factors are multiplied together.
                "functions": [
                    {"boost_factor": "1.2", "filter": {"term": {"lang": "en"}}},
                    {"field_value_factor": {"field": "boost", "factor": 1}},
                    {"boost_factor": "0.25", "filter": {"type": {"value": "definition"}}},
                    {"boost_factor": "2", "filter": {"term": {"uid": query.replace(" ", "").lower()}}},
                    {"boost_factor": "1.2", "filter": {"term": {"is_root": True}}},
                ],
                "score_mode": "multiply",
            }
        },
    }
    import json

    # Debug output: the request body as it stands before the highlight
    # settings are added.
    print(json.dumps(body, indent=2))

    if highlight:
        body["highlight"] = {
            "pre_tags": ['<strong class="highlight">'],
            "post_tags": ["</strong>"],
            "order": "score",
            "require_field_match": False,
            "fields": {
                "content": {
                    "matched_fields": ["content", "content.folded", "content.stemmed"],
                    "type": "fvh",
                    "fragment_size": 100,
                    "number_of_fragments": 3,
                    "no_match_size": 250,
                }
            },
        }

    return es.search(index=index_string, body=body)
Ejemplo n.º 6
0
def search(query,
           highlight=True,
           offset=0,
           limit=10,
           lang=None,
           define=None,
           details=None,
           **kwargs):
    """Run a scored full-text search over one or more indexes.

    Args:
        query: Search string; surrounding whitespace is removed. If it
            contains any of ``: " ~ *`` or the substrings ``AND``, ``OR`` or
            ``NOT`` it is sent as a ``query_string`` query (with ``define:``
            rewritten to ``term:``), otherwise as a ``multi_match`` query.
        highlight: When true, ask for highlighted fragments of ``content``.
        offset: Index of the first hit to return.
        limit: Maximum number of hits to return.
        lang: Optional index name to add to the search.
        define: When not None, the 'en-dict' index is searched.
        details: When not None, the 'suttas' index is searched.
        **kwargs: Accepted but not used by this function.

    Returns:
        The response of ``es.search``.
    """
    # str.strip() returns a new string, so the result has to be rebound.
    query = query.strip()

    indexes = []
    if details is not None:
        indexes = ['suttas']
    if define is not None:
        indexes.append('en-dict')
    if lang:
        indexes.append(lang)

    # Nothing requested explicitly: search the default set of indexes.
    if not indexes:
        indexes = ['en', 'pi', 'suttas', 'en-dict']

    index_string = ','.join(get_available_indexes(indexes))

    fields = [
        "content", "content.*^0.5", "term^1.5", "term.*^0.5", "gloss^1.5",
        "lang^0.5", "author^0.5", "uid", "uid.division^0.7", "name^1.25",
        "name.*^0.75", "heading.title^0.5", "heading.title.plain^0.5",
        "heading.title.shingle^0.5"
    ]

    if (regex.search(r'[:"~*]', query) or regex.search(r'AND|OR|NOT', query)):
        query = query.replace('define:', 'term:')
        inner_query = {
            "query_string": {
                "fields": fields,
                "query": query,
                "use_dis_max": True
            }
        }
    else:
        inner_query = {
            "multi_match": {
                "type": "best_fields",
                "tie_breaker": 0.3,
                "fields": fields,
                "query": query
            }
        }

    body = {
        "from": offset,
        "size": limit,
        "_source": [
            "uid", "lang", "name", "volpage", "gloss", "term", "heading",
            "is_root"
        ],
        "timeout": "15s",
        "query": {
            "function_score": {
                "query": inner_query,
                # The individual factors are multiplied together.
                "functions": [{
                    "weight": "1.2",
                    "filter": {
                        "term": {
                            "lang": "en"
                        }
                    }
                }, {
                    "field_value_factor": {
                        "field": "boost",
                        "factor": 1.0,
                        "missing": 1.0
                    }
                }, {
                    "weight": "0.25",
                    "filter": {
                        "type": {
                            "value": "definition"
                        }
                    }
                }, {
                    "weight": "2",
                    "filter": {
                        "term": {
                            "uid": query.replace(' ', '').lower()
                        }
                    }
                }, {
                    "weight": "1.2",
                    "filter": {
                        "term": {
                            "is_root": True
                        }
                    }
                }],
                "score_mode": "multiply"
            }
        }
    }
    import json
    # Debug output: the target indexes and the request body as it stands
    # before the highlight settings are added.
    print('searching index: {}'.format(index_string))
    print(json.dumps(body, indent=2))

    if highlight:
        body["highlight"] = {
            "pre_tags": ["<strong class=\"highlight\">"],
            "post_tags": ["</strong>"],
            "order": "score",
            "require_field_match": False,
            "fields": {
                "content": {
                    "matched_fields":
                    ["content", "content.folded", "content.stemmed"],
                    "type": "fvh",
                    "fragment_size": 100,
                    "number_of_fragments": 3,
                    "no_match_size": 250
                }
            }
        }

    return es.search(index=index_string, body=body)
Ejemplo n.º 7
0
def sutta_search(**kwargs):
    """Search the "suttas" index by name, volpage, acronym, division and lang.

    Keyword Args:
        mode: Elasticsearch query type used for every field except ``lang``
            (which always uses ``term``). Defaults to ``"wildcard"``.
        name, volpage, acronym, division, lang: Values to match; each is
            lower-cased before being sent. Missing or falsy values are
            skipped. ``volpage`` matches either ``volpage`` or
            ``volpage_extra``; ``division`` matches either ``division`` or
            ``subdivision``; ``acronym`` is matched against ``uid`` after
            spaces are removed.
        limit: Maximum number of hits (default 25).
        offset: Index of the first hit (default 0).

    Returns:
        The response of ``es.search``, or None when no searchable field was
        supplied.
    """
    mode = kwargs.get("mode") or "wildcard"
    fields = {
        "name": {
            "mode": mode
        },
        "volpage": {
            "mode": mode,
            "fields": ["volpage", "volpage_extra"]
        },
        "acronym": {
            "mode": mode,
            "field": "uid"
        },
        "division": {
            "mode": mode,
            "fields": ["division", "subdivision"]
        },
        "lang": {
            "mode": "term"
        }
    }

    # Normalise only a real value: an explicit ``acronym=None`` is treated
    # like any other missing field instead of raising AttributeError.
    if kwargs.get("acronym"):
        kwargs["acronym"] = kwargs["acronym"].lower().replace(' ', '')

    queries = []

    for field, params in fields.items():
        value = kwargs.get(field)
        if not value:
            continue
        if "fields" in params:
            # Several candidate fields: a hit in any one of them is enough.
            sub_query = {
                "bool": {
                    "should": [{
                        params["mode"]: {
                            sub_field: {
                                "value": value.lower()
                            }
                        }
                    } for sub_field in params["fields"]]
                }
            }
            queries.append(sub_query)
        else:
            queries.append({
                params["mode"]: {
                    params.get("field", field): {
                        "value": value.lower()
                    }
                }
            })

    if not queries:
        return None

    body = {
        "size": int(kwargs.get("limit", 25)),
        "from": int(kwargs.get("offset", 0)),
        # Every supplied field has to match.
        "query": {
            "bool": {
                "must": queries
            }
        },
        "sort": [{
            "_score": {
                "order": "desc"
            }
        }, {
            "ordering": {
                "order": "asc"
            }
        }]
    }

    return es.search(index="suttas", body=body)