Example #1
def autofocus(parse_test):
    # Assume that we have already parsed and have a good database.
    fields = ['tag_group', 'public_tag_name', 'tag_class', 'description', 'tag']
    # Try processing 10 times. That's probably enough.
    # for _ in range(10):
    process_domains()

    processed_search = Search(index=f"content_{parse_test.version}").query('match', processed=2)
    num_processed = 0
    for hit in processed_search.scan():
        # Check that each of these specific cases has some information in the database.
        for field in fields:
            assert hit[field] is not None, f"Domain {hit['domain']} missing field {field}."
    num_processed += processed_search.count()
    partly_processed_search = Search(index=f"content_{parse_test.version}").query('match', processed=1)
    num_processed += partly_processed_search.count()

    # Count non-generic domains (the domains which should have been processed).
    non_generic_search = Search(index=f"content_{parse_test.version}").exclude('term', header__keyword='generic')
    num_non_generic = non_generic_search.count()

    # Check to see what percentage of the domains have processed.
    logging.info(f"Processed {num_processed} out of {num_non_generic}.")
    percent_processed = float(num_processed) / float(num_non_generic)
    logging.info(f"Processed {percent_processed*100}% of domains.")
    assert percent_processed >= parse_test.percent_processed, (f"Processed only {percent_processed*100}% "
                                                               f"of domains, not {parse_test.percent_processed*100}%.")
Example #2
def queryTopic(topic, days=None, sent=None):

    s = Search(using=client, index=collectionName)

    if days and days != "-1":

        days = int(days)
        now = int(time.time())
        last = now - days * 86400  # assumes the original `day` constant was seconds per day

        s = s.filter('range', date={'gte': last, 'lte': now})

    if sent and sent != "-1":
        s = s.filter('term', class_code=int(sent))

    q = Q('query_string', query=topic, default_field="topics.keyword")
    s = s.query(q)

    total = min(s.count(), 100)

    s = s[0:total]
    results = s.execute()

    return results
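# Usage sketch (added for illustration; not part of the original example). Assumes the
# module-level `client` (Elasticsearch connection) and `collectionName` used above exist,
# and that documents expose `topics.keyword`, `date`, and `class_code` as the query implies.
if __name__ == "__main__":
    for hit in queryTopic("energy", days="7", sent="-1"):
        print(hit.meta.id, hit.meta.score)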
Example #3
    def handle(self, *args, **options):
        # Initialize the Elasticsearch client
        es = Elasticsearch(settings.ELASTIC_HOST,
                           timeout=settings.ELASTIC_TIMEOUT)

        # Invention applications that have I_43.D
        query = Q(
            "match",
            Document__idObjType=1,
        ) & ~Q('query_string', query="_exists_:Claim.I_11") & Q(
            'query_string',
            query="_exists_:Claim.I_43.D AND NOT _exists_:Claim.I_43_bul_str")
        s = Search().using(es).query(query)
        c = s.count()
        i = 0
        for h in s.scan():
            body = h.to_dict()

            i += 1
            print(f"apps: {i}/{c} - {body['Claim']['I_21']} - {h.meta.id}")

            i_43_d = body['Claim']['I_43.D'][0]
            bulletin = ClListOfficialBulletinsIp.objects.get(bul_date=i_43_d)
            bull_str = f"{bulletin.bul_number}/{bulletin.bul_date.year}"
            body['Claim']['I_43_bul_str'] = bull_str
            es.index(index=settings.ELASTIC_INDEX_NAME,
                     doc_type='_doc',
                     id=h.meta.id,
                     body=body,
                     request_timeout=30)

        # Invention patents and utility models that have I_45.D
        query = Q(
            'query_string',
            query="_exists_:Patent.I_45.D AND NOT _exists_:Patent.I_45_bul_str"
        )
        s = Search().using(es).query(query)
        c = s.count()
        i = 0
        for h in s.scan():
            body = h.to_dict()

            i += 1
            print(
                f"pr. docs: {i}/{c} - {body['Patent']['I_21']} - {h.meta.id}")

            i_45_d = body['Patent']['I_45.D'][-1]
            bulletin = ClListOfficialBulletinsIp.objects.get(bul_date=i_45_d)
            bull_str = f"{bulletin.bul_number}/{bulletin.bul_date.year}"
            body['Patent']['I_45_bul_str'] = bull_str
            es.index(index=settings.ELASTIC_INDEX_NAME,
                     doc_type='_doc',
                     id=h.meta.id,
                     body=body,
                     request_timeout=30)

        self.stdout.write(self.style.SUCCESS('Finished'))
Example #4
def stats_processor(request):
    s = Search(index=CATALOG_INDICES)
    # res = s.params(search_type="count").aggs.metric(
    #     "distinct_names", "cardinality", field="full_name").execute()

    return {
        'total_declarations': s.count(),
        'total_persons': s.count()  # res.aggregations.distinct_names.value
    }
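# Sketch of the cardinality aggregation hinted at in the commented-out lines above. The
# function name and the "full_name" field are taken from those comments and are assumptions,
# not part of the original project.
def stats_processor_with_cardinality(request):
    s = Search(index=CATALOG_INDICES)
    s.aggs.metric("distinct_names", "cardinality", field="full_name")
    res = s[:0].execute()  # size=0: only aggregations are needed, no hits
    return {
        'total_declarations': s.count(),
        'total_persons': res.aggregations.distinct_names.value
    }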
Example #5
    def get(self, request):
        def dynamics_list(d):
            res = [0 if d[i] == 0 else int((d[i + 1] - d[i]) / d[i] * 100) for i in range(len(d) - 1)]
            res.insert(0, 0)
            return res

        date_range = [request.query_params.get('date_from') + '-01', request.query_params.get('date_to') + '-01']
        date_range = [date(int(a[:4]), int(a[5:7]), 1) for a in date_range]
        type = request.query_params.get('category')
        s = Search(index='statistic').params(request_timeout=100)
        tnved = request.query_params.get('tnved')
        start_tnved_list = int(request.query_params.get('start'))
        length_tnved_list = int(request.query_params.get('length'))
        label_list = []
        netto_list = []
        stoim_list = []
        if not tnved:
            two_tnved_list_request = StatisticDataDocument.search()
            two_tnved_list_request.aggs.bucket('a', 'terms', field='tnved_two', size=200)
            result = two_tnved_list_request.execute()
            tnved_two_distinct = [item.key for item in result.aggregations.a.buckets]
            tnved_two_distinct.reverse()
            for i in tnved_two_distinct[start_tnved_list:start_tnved_list + length_tnved_list]:
                s.query = Q('bool', must=[Q('match', napr=type),
                                          Q('match', tnved_two=i),
                                          Q('range', period={'gte': date_range[0], 'lt': date_range[1]})])
                s.aggs.metric('stoim', 'sum', field='stoim')
                s.aggs.metric('netto', 'sum', field='netto')
                result = s[:s.count()].execute().aggregations
                label_list.append(i)
                netto_list.append(result['netto']['value'])
                stoim_list.append(result['stoim']['value'])
        else:
            tnved_distinct = [i[tnved_dict[len(tnved) + 2]] for i in
                              StatisticData.objects.filter(**{tnved_dict[len(tnved)]: tnved}).values(
                                  tnved_dict[len(tnved) + 2]).distinct()]
            for i in tnved_distinct[start_tnved_list:start_tnved_list + length_tnved_list]:
                tnved_query_field = {tnved_dict[len(tnved) + 2]: i}
                s.query = Q('bool', must=[Q('match', napr=type),
                                          Q('match', **tnved_query_field),
                                          Q('range', period={'gte': date_range[0], 'lt': date_range[1]})])
                s.aggs.metric('stoim', 'sum', field='stoim')
                s.aggs.metric('netto', 'sum', field='netto')
                result = s[:s.count()].execute().aggregations
                label_list.append(i)
                netto_list.append(result['netto']['value'])
                stoim_list.append(result['stoim']['value'])
        context = {
            'labels': label_list,
            'netto': [netto_list, dynamics_list(netto_list)],
            'cost': [stoim_list, dynamics_list(stoim_list)]
        }
        return JsonResponse(context)
Example #6
 def _get_es_conn(self):
     try:
         kwargs = dict(
             hosts=['localhost'],
             port=9200,
             use_ssl=False,
         )
         CLIENT = Elasticsearch(**kwargs)
         es_conn = Search(using=CLIENT, index="geonames")
         es_conn.count()
         return es_conn
     except Exception as exc:
         raise ConnectionError("Error establishing connection with ES container") from exc
Example #7
def admin_loc(lat, long):
    geonameid = id_lat_long(lat, long)
    request1 = 'SELECT admin1_code, admin2_code, admin3_code, admin4_code FROM geoname WHERE "geonameid"=%s'
    session1 = cluster.execute(request1, [int(geonameid)])
    colonne = session1.one()
    result = []
    if colonne.admin4_code is not None:
        result = [
            colonne.admin1_code, colonne.admin2_code, colonne.admin3_code,
            colonne.admin4_code
        ]
    elif colonne.admin3_code is not None:
        result = [
            colonne.admin1_code, colonne.admin2_code, colonne.admin3_code
        ]
    elif colonne.admin2_code is not None:
        result = [colonne.admin1_code, colonne.admin2_code]
    else:
        result = [colonne.admin1_code]

    search = Search(index="geoname").using(client)
    search = search.query('match', admin1_code=result[0])

    geoname_tab = []
    it = min(search.count(), 1000)
    for hit in search[0:it]:
        coords_1 = (lat, long)
        coords_2 = (hit.latitude, hit.longitude)
        geoname_tab.append({
            'geonameid': hit.geonameid,
            'asciiname': hit.asciiname,
            'latitude': hit.latitude,
            'longitude': hit.longitude,
            'distance': distance.geodesic(coords_1, coords_2).km
        })

    geoname_tab = sorted(geoname_tab, key=lambda k: k['distance'])
    return geoname_tab


#
# for ti in admin_loc(43.82512, 1.72382)[0:10]:
#     print(ti)
Example #8
def get_usernames_for_crawl():
    ms = MultiSearch(index='populars')
    q = Q({"bool": {"must_not": {"exists": {"field": "last_update"}}}})
    never_updated = Search().query(q)
    total = never_updated.count()
    never_updated = never_updated[0:total]
    old_updated = Search().query('range', last_update={"lte": "now-2d"})
    total = old_updated.count()
    old_updated = old_updated[0:total]
    ms = ms.add(never_updated)
    ms = ms.add(old_updated)
    responses = ms.execute()
    for res in responses:
        for hit in res:
            yield hit.username
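# Usage sketch (added): MultiSearch.execute() returns one response per added search, in the
# order they were added, so the generator above yields never-updated usernames first and
# then those last updated more than two days ago.
for username in get_usernames_for_crawl():
    print(username)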
Example #9
def get_articles(w, keywords, index):
    """
    Get article details from ES. Return a single DOI per article.
    :w: str, concept name
    :keywords: str, ontology keywords
    :index: str
    :return: list of dict
    """
    review_list = [("review", 1), ("feature article", .5)
                   #        ("survey", .5)
                   ]

    should_k = [
        Q('multi_match',
          query=rw[0],
          fields=['title', "abstract"],
          boost=rw[1]) for rw in review_list if rw[0] != w
    ]

    if keywords:
        keywords = [kw for kw in keywords.split(',') if kw != w]
        should_k += [Q('multi_match', query=kw, boost=2) for kw in keywords]

    query = Q('bool',
              must=[Q('multi_match', query=w)],
              should=should_k,
              minimum_should_match=1 if keywords else None)

    request = Search(index=index)
    request = request.query(query)
    request = request.sort("_score")
    request = request.source([
        'DOI', 'title', 'URL', 'authors', 'abstract', 'provider',
        'provider_id', 'publication_date'
    ])
    # count() issues its own request, so no separate execute() is needed here; cap the
    # window at 9999 hits to stay within the default index.max_result_window.
    results = [hit.to_dict() for hit in request[:min(request.count(), 9999)]]

    for hit in results:
        hit['DOI'] = get_url(hit['DOI']) if hit.get('DOI') else hit.get('URL')
        hit['review'] = any(rw[0] in hit.get('title', '').lower()
                            for rw in review_list)

    return results
Example #10
    def documents_by_text(self, grouped_targets: dict, queries: list,
                          from_index: int, size: int) -> tuple:
        """
        Paginated documents found by text.
        """
        # For pagination/score sorting to work, we need to query all the different corpus indices in the same
        # Elasticsearch query.
        # We use the same grouped-target approach as searching documents by annotations, even though buckets
        # are inconsequential for text search.
        indices = self.target_text_document_indices(grouped_targets)
        indices_argument = ','.join(indices)

        language_manager = get_language_manager()
        match_queries = [
            to_match_query(language_manager, query) for query in queries
        ]
        grouped_queries = self.group_queries_by_operator(match_queries)

        # A query language restriction, if present, will work automatically via the query text.<language> mapping.
        es = get_es_conn()
        search = Search(using=es, index=indices_argument)
        search = search.source(["title", "language", "source"])

        search.query = Q('bool',
                         must=grouped_queries["must"],
                         must_not=grouped_queries["must_not"],
                         should=grouped_queries["should"])

        search = search[from_index:from_index + size]
        count = search.count()
        documents = [self.map_hit_with_score(hit) for hit in search]

        return count, documents
Example #11
def get_company_by_paizhao_name(paizhao_name):
    companys_search = Search().using(es).index('paizhao').query('match', paizhao_name=paizhao_name)
    count_num = companys_search.count()
    print(count_num)
    companys = companys_search[0:count_num]
    for one_company in companys:
        print(one_company.jigouquancheng)
Example #12
    def _getCount(self):
        client = Elasticsearch()
        client.indices.refresh(index='gracc.osg.summary*')
        s = Search(using=client, index='gracc.osg.summary*') \
        .filter('range', **{'EndTime': {'from': '2016-01-01', 'to': '2017-01-01'}})

        return s.count()
Example #13
    def retrieve_gurus_tweets(self, current_year=True):
        """
        Returns all tweets for each guru in Elasticsearch within a time window.
        :return: dictionary mapping each guru's screen name to a list of tweet texts
        """
        gurus_dict = dict()
        gurus_entries = Search(
            using=self.client, index=self.GURUS_INDEX
        ).source(
            include=["body.text", "body.user.screen_name", "body.created_at"])
        if current_year:
            gurus_entries = gurus_entries.filter(
                'bool',
                must={
                    'query_string': {
                        'default_field': 'body.created_at',
                        "query": f"""*{gmtime().tm_year}"""
                    }
                })

        # scan() streams every matching document, so the explicit slice is unnecessary.
        for entry in gurus_entries.scan():
            user = entry.body["user"]["screen_name"]
            text = entry.body["text"]
            gurus_dict.setdefault(user, []).append(text)

        return gurus_dict
Example #14
def load_filtered_top_ko_mutations_genes(filters, start=0, size=50):
    """Retrieves top genes according to number of KO mutations and filter them through the tickable options"""
    # First aggregate over associations
    s = Search(using=es, doc_type='ko_associations')
    if 'chr' in filters and len(filters['chr']) > 0 and len(filters['chr']) < 5:
        s = s.filter(Q('bool', should=[Q({'nested':{'path':'gene', 'query':{'match':{'gene.chr':chrom if len(chrom) > 3 else 'chr%s' % chrom}}}}) for chrom in
                                       filters['chr']]))
    if 'significant' in filters:
        s = s.filter(Q('range', mac={'gte': 6}))
        s = s.filter('term', overBonferroni='T') # TODO: change this to permutation once the new indexed scores are in.

    agg = A("terms", field="gene.id", size='33341') # Need to check ALL GENES for further lists
    s.aggs.bucket('genes', 'nested', path='gene').bucket('gene_count', agg) # Need to have a NESTED query
    top_genes = s.execute().aggregations.genes.gene_count.buckets
    # The KO associations are already retrieved, just need to assign them to the right gene.
    association_dict = defaultdict(list)
    for asso in s[0:s.count()].execute().to_dict()['hits']['hits']:
        association_dict[asso['_source']['gene']['name']].append(asso['_source'])
    genes = []
    for top in top_genes[start:start+size]:
        id = top['key']
        matches = GENE_ID_PATTERN.match(id)
        if not matches:
            continue
        gene = load_gene_by_id(top['key'])
        gene['n_hits'] = top['doc_count']
        gene['ko_associations'] = association_dict[top['key']]
        genes.append(gene)
    return genes, len(top_genes)
Example #15
def get_all():
    # This route is a temporary hack for the WMS service. API Key is not real security, just something to try to prevent random bots from getting a response
    s = Search(using=client, index='imagery', doc_type="metadata")
    total = s.count()
    s = s[0:total]
    response = s.execute()
    return response.to_dict()
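# Alternative sketch (illustrative, not from the original project): when every matching
# document is needed, Search.scan() streams hits with the scroll API and avoids the
# count()-then-slice pattern above. Reuses the module-level `client` assumed by get_all().
def get_all_scan():
    s = Search(using=client, index='imagery', doc_type="metadata")
    return [hit.to_dict() for hit in s.scan()]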
Example #16
def get_all():
    client = Elasticsearch()
    s = Search(using=client, index='courses')
    count = s.count()
    result = s[0:count].execute()['hits']['hits']
    result = sorted(result, key=lambda x: int(x["_id"]))
    return result
Example #17
def search_all(es_conn, index):
    responses = Search(using=es_conn, index=INDEX_NAMES[index]).query(
        "match", code=DEFAULT_QUERIES[index])
    count = responses.count()
    result = responses[0:count].execute().to_dict()
    res = {'response': result['hits']['hits']}
    return JsonResponse(res)
Example #18
def test_record_page(app, db, es, event_queues, full_record):
    """Test record page views."""
    full_record['conceptdoi'] = '10.1234/foo.concept'
    full_record['conceptrecid'] = 'foo.concept'
    r = Record.create(full_record)
    PersistentIdentifier.create(
        'recid', '12345', object_type='rec', object_uuid=r.id,
        status=PIDStatus.REGISTERED)
    db.session.commit()

    with app.test_client() as client:
        record_url = url_for('invenio_records_ui.recid', pid_value='12345')
        assert client.get(record_url).status_code == 200

    process_events(['record-view'])
    current_search.flush_and_refresh(index='events-stats-record-view')

    search = Search(using=es, index='events-stats-record-view')
    assert search.count() == 1
    doc = search.execute()[0]
    assert doc['doi'] == '10.1234/foo.bar'
    assert doc['conceptdoi'] == '10.1234/foo.concept'
    assert doc['recid'] == '12345'
    assert doc['conceptrecid'] == 'foo.concept'
    assert doc['resource_type'] == {'type': 'publication', 'subtype': 'book'}
    assert doc['access_right'] == 'open'
    assert doc['communities'] == ['zenodo']
    assert doc['owners'] == [1]
Example #19
def test_file_download(app, db, es, event_queues, record_with_files_creation):
    """Test file download views."""
    recid, record, _ = record_with_files_creation
    record['conceptdoi'] = '10.1234/foo.concept'
    record['conceptrecid'] = 'foo.concept'
    record.commit()
    db.session.commit()

    with app.test_client() as client:
        file_url = url_for(
            'invenio_records_ui.recid_files',
            pid_value=recid.pid_value,
            filename='Test.pdf',
        )
        assert client.get(file_url).status_code == 200

    process_events(['file-download'])
    current_search.flush_and_refresh(index='events-stats-file-download')

    search = Search(using=es, index='events-stats-file-download')
    assert search.count() == 1
    doc = search.execute()[0]
    assert doc['doi'] == '10.1234/foo.bar'
    assert doc['conceptdoi'] == '10.1234/foo.concept'
    assert doc['recid'] == '12345'
    assert doc['conceptrecid'] == 'foo.concept'
    assert doc['resource_type'] == {'type': 'publication', 'subtype': 'book'}
    assert doc['access_right'] == 'open'
    assert doc['communities'] == ['zenodo']
    assert doc['owners'] == [1]
Example #20
def get_stock_quotes_hist_from_es(symbol: str = 'LPG',
                                  es: elasticsearch.Elasticsearch = ELS_CLIENT
                                  ):
    """
    Search Elasticsearch for the quotes.
    :param symbol:
    :param es:
    :return:
    """

    init_stock_quotes_hist_idx(es)
    s = Search(using=es, index="stock_quotes_hist") \
        .filter("term", symbol=symbol) \
        .sort({"as_of_date": {"order": "asc"}}) \
        .params(request_timeout=300)

    if s.count() == 0:
        return

    for hit in s.scan():
        yield {
            "symbol": hit.symbol,
            "as_of_date": hit.as_of_date,
            "close": hit.close,
            "volume": hit.volume
        }
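# Usage sketch (added for illustration): the function above is a generator, so quotes can be
# consumed lazily without materialising the whole history; ELS_CLIENT is its default client.
for quote in get_stock_quotes_hist_from_es("LPG"):
    print(quote["as_of_date"], quote["close"])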
Example #21
def read_processed_events(client, start_dt, end_time, index):

    start_time = start_dt.strftime('%Y-%m-%dT%H:%M:%S')
    print('Querying for data starting with ' + start_time)

    s = Search(using=client, index=index) \
               .filter('range', ** {'@timestamp': {'gt': start_time, 'lt':end_time}})

    total = s.count()
    if total == 0:
        return []
    print(total)
    s = s[:total]

    try:
        response = s.execute()
        if not response.success():
            raise RuntimeError('Elasticsearch query did not complete successfully')
    except Exception as e:
        print(e, 'Error accessing Elasticsearch')
        sys.exit(1)
    data = []
    for entry in response.to_dict()['hits']['hits']:
        data.append(entry['_source'])

    return data
Example #22
def retrieve_elastic(d_end,
                     d_start,
                     ap_mac="",
                     ue_mac="",
                     elastic_host="localhost:9200"):
    client = Elasticsearch(
        [elastic_host],
        scheme="http",
        port=9200,
    )
    s = Search(using=client, index="probe_clients")
    # querry_time = 1600
    # d_end = datetime.datetime.now()
    # d_start = d_end - datetime.timedelta(minutes=querry_time)
    if ap_mac != "":
        s = s.query("match", ap_mac=ap_mac)
    if ue_mac != "":
        s = s.query("match", mac=ue_mac)
    s = s.filter('range', timestamp={
        'gte': d_start,
        'lt': d_end
    }).sort('-timestamp')
    total = s.count()
    s = s[0:total]
    response = s.execute()
    # print(len(response))
    # for hit in response:
    #     print("ap_mac: {}, mac: {}, rssi: {}, time: {}".format(hit.ap_mac, hit.mac, hit.rssi, hit.timestamp))
    return response
Example #23
async def check_es(request):
    """Check whether the ES connection is working properly."""
    domain_name = request.json.get("domain")
    time_out = 300
    try:
        es = Elasticsearch(ES_HOST[0],
                           http_auth=(ES_HOST[1], ES_HOST[2]),
                           timeout=time_out)
        time_point = time.mktime(datetime.datetime.now().replace(
            second=0, microsecond=0).timetuple())
        logstash_index_timestamp = int(time_point - 300)
        date_obj = datetime.datetime.fromtimestamp(
            logstash_index_timestamp) - datetime.timedelta(hours=8)
        logstash_index = '%s%s' % ('domainip-',
                                   date_obj.strftime("%Y.%m.%d.%H"))
        filt = Q("match", msecRegion=logstash_index_timestamp) & Q(
            "match", domain=domain_name)
        s = Search(using=es, index=logstash_index).query(filt)
        response = s.count()
        return_code = 0
        if response > 0:
            result = 0  # ES is already configured
        else:
            result = 1  # ES is not configured
    except Exception as e:
        logger.error(f'check es: {e}')
        return_code = -1
        result = -1  # system error
    ret = {"es_stat": result, "return_code": return_code}
    return json(ret)
Example #24
    def es_read(self, log_id, offset):
        """
        Returns the logs matching log_id in Elasticsearch and next offset.
        Returns '' if no log is found or there was an error.
        :param log_id: the log_id of the log to read.
        :type log_id: str
        :param offset: the offset start to read log from.
        :type offset: str
        """

        # Offset is the unique key for sorting logs given log_id.
        s = Search(using=self.client) \
            .query('match_phrase', log_id=log_id) \
            .sort('offset')

        s = s.filter('range', offset={'gt': offset})

        logs = []
        if s.count() != 0:
            try:

                logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                    .execute()
            except Exception as e:
                msg = 'Could not read log with log_id: {}, ' \
                      'error: {}'.format(log_id, str(e))
                self.log.exception(msg)

        return logs
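# Note on the slicing above (added, illustrative): elasticsearch_dsl slices are absolute hit
# positions, so s[10:20] becomes {"from": 10, "size": 10}. A hypothetical helper that fetches
# page `n` of `page_size` hits would therefore look like:
def fetch_page(s, n, page_size):
    start = n * page_size
    return s[start:start + page_size].execute()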
Example #25
def search_by_date(time_begin,
                   time_end,
                   article_id=None,
                   agent=None,
                   country=None,
                   city=None,
                   poll_id=None,
                   widget=False):
    try:
        client = Elasticsearch([settings.ELASTICSEARCH_URL])
        s = Search(using=client, index="userstat")

        query_list = []
        query_list.append(
            Q('range', timestamp={
                'gte': time_begin,
                'lt': time_end
            }))
        if agent is not None:
            query_list.append(Q('match', agent=agent))
        if country:
            query_list.append(Q('match', country=country))
        if city:
            query_list.append(Q('match', city=city))
        if poll_id:
            query_list.append(Q('match', poll=poll_id))
        if article_id:
            query_list.append(Q('match', article=article_id))
        if widget:
            query_list.append(Q('match', widget=widget))

        query = reduce(operator.and_, query_list)

        poll_created_filter = Q('match', poll_created=True)
        poll_viewed_filter = Q('match', poll_viewed=True)
        poll_answered_filter = Q('match', poll_answered=True)
        user_created_filter = Q('match', user_created=True)

        poll_count = s.query(
            (query & Q('bool', filter=[poll_created_filter]))).count()
        view_count = s.query(
            (query & Q('bool', filter=[poll_viewed_filter]))).count()
        user_count = s.query(
            (query & Q('bool', filter=[user_created_filter]))).count()

        s = s.query((query & Q('bool', filter=[poll_answered_filter])))
        answer_count = s.count()
        a = A('cardinality', field='user')
        s.aggs.bucket('unique_users', a)
        s = s.execute()

        return {
            'poll_count': poll_count,
            'view_count': view_count,
            'answer_count': answer_count,
            'user_count': user_count,
            'respondents_count': s.aggregations.unique_users['value']
        }
    except Exception:
        return None
Example #26
    def es_read(self, log_id, offset, metadata):
        """
        Returns the logs matching log_id in Elasticsearch and next offset.
        Returns '' if no log is found or there was an error.
        :param log_id: the log_id of the log to read.
        :type log_id: str
        :param offset: the offset start to read log from.
        :type offset: str
        :param metadata: log metadata, used for streaming log download.
        :type metadata: dict
        """

        # Offset is the unique key for sorting logs given log_id.
        s = Search(using=self.client) \
            .query('match_phrase', log_id=log_id) \
            .sort('offset')

        s = s.filter('range', offset={'gt': int(offset)})
        max_log_line = s.count()
        if 'download_logs' in metadata and metadata['download_logs'] and 'max_offset' not in metadata:
            try:
                metadata['max_offset'] = s[max_log_line - 1].execute()[-1].offset if max_log_line > 0 else 0
            except Exception:
                self.log.exception('Could not get current log size with log_id: {}'.format(log_id))

        logs = []
        if max_log_line != 0:
            try:

                logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                    .execute()
            except Exception as e:
                self.log.exception('Could not read log with log_id: %s, error: %s', log_id, str(e))

        return logs
Example #27
 def count(self):
     """
     Returns the number of indexed tweets
     """
     search = Search(using=self.es, index='tweets')
     # count() issues its own request, so an explicit execute() is unnecessary here.
     return search.count()
Example #28
    def query_event_ids(self):
        es_query = []
        es_query.append({'match': {'winlog.provider_name': MICROSOFT_WINDOWS_DNSCLIENT_PROVIDER_NAME}})
        query = Q({'bool': {'must': es_query}})
        s = Search(using = self.Client, index = "winlogbeat-*").query(query)
        s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])  # source() returns a new Search

        count = s.count()
        print("Count: %d" % (count))

        event_ids = {}
        i = 0

        try:
            for hit in s.scan():
                print('%d. %d' % (i, hit.winlog.event_id))
                if hit.winlog.event_id not in event_ids:
                    event_ids[hit.winlog.event_id] = 1
                    print("%s: %d" % (hit.winlog.provider_name, hit.winlog.event_id))
                else:
                    event_ids[hit.winlog.event_id] += 1  
                    
                i += 1
        except Exception:
            traceback.print_exc()
Example #29
    def es_read(self, log_id, offset):
        """
        Returns the logs matching log_id in Elasticsearch and next offset.
        Returns '' if no log is found or there was an error.
        :param log_id: the log_id of the log to read.
        :type log_id: str
        :param offset: the offset start to read log from.
        :type offset: str
        """

        # Offset is the unique key for sorting logs given log_id.
        s = Search(using=self.client) \
            .query('match', log_id=log_id) \
            .sort('offset')

        s = s.filter('range', offset={'gt': offset})

        logs = []
        if s.count() != 0:
            try:

                logs = s[self.MAX_LINE_PER_PAGE * self.PAGE:self.MAX_LINE_PER_PAGE] \
                    .execute()
            except Exception as e:
                msg = 'Could not read log with log_id: {}, ' \
                      'error: {}'.format(log_id, str(e))
                self.log.exception(msg)

        return logs
Example #30
def listAllDontCrawled():
    linkscrawledList = []
    try:
        s = Search(using=client, index=INDEX).filter("term", crawled=False)
        count = s.count()
        results = s[0:count].execute()

        idtemp = 0

        for link in results:
            print(link.meta.id, link.url, link.type, link.crawled)
            auxlink = Link()
            auxlink.id = link.meta.id
            auxlink.url = link.url
            auxlink.type = link.type
            auxlink.crawled = link.crawled
            auxlink.text = link.text

            linktmp = searchByIndex(auxlink.id)
            if not linktmp:
                continue
            linktmp.crawled = True
            linktmp.save()

            linkscrawledList.append(auxlink)

        return linkscrawledList
    except Exception:
        return linkscrawledList
Example #31
def get_bug_count(doc_type, key):
    payload = {'query': {'bool': {'must': [{'exists': {'field': 'bugs.{0}'.format(key)}}]}}, 'from': 0, 'size': 20,
               'sort': {'published_from': {'order': 'desc'}}}
    s = Search(using=es, index='w12scan', doc_type=doc_type).from_dict(payload)
    res = s.count()

    return res
Example #32
def count_inbound_refs(
    es_client: Any,
    release_ident: Optional[str] = None,
    work_ident: Optional[str] = None,
    openlibrary_work: Optional[str] = None,
    url: Optional[str] = None,
    filter_stage: List[str] = [],
    es_index: str = "fatcat_ref",
) -> int:
    """
    Same parameters as get_inbound_refs(), but returns just a count
    """

    search = Search(using=es_client, index=es_index)

    if release_ident:
        search = search.filter("term", target_release_ident=release_ident)
    elif work_ident:
        search = search.filter("term", target_work_ident=work_ident)
    elif openlibrary_work:
        search = search.filter("term",
                               target_openlibrary_work=openlibrary_work)
    else:
        raise ValueError("require a lookup key")

    if filter_stage:
        search = search.filter("term", source_stage=filter_stage)

    return search.count()
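# Usage sketch (illustrative; the release identifier below is made up): count the references
# pointing at a single release, limited to published sources. `es_client` is any existing
# Elasticsearch client.
n_refs = count_inbound_refs(es_client, release_ident="example_release_ident",
                            filter_stage=["published"])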
Example #33
    def es_read(self, log_id: str, offset: str, metadata: dict) -> list:
        """
        Returns the logs matching log_id in Elasticsearch and next offset.
        Returns '' if no log is found or there was an error.

        :param log_id: the log_id of the log to read.
        :param offset: the offset start to read log from.
        :param metadata: log metadata, used for streaming log download.
        """
        # Offset is the unique key for sorting logs given log_id.
        search = Search(using=self.client).query('match_phrase', log_id=log_id).sort(self.offset_field)

        search = search.filter('range', **{self.offset_field: {'gt': int(offset)}})
        max_log_line = search.count()
        if 'download_logs' in metadata and metadata['download_logs'] and 'max_offset' not in metadata:
            try:
                if max_log_line > 0:
                    metadata['max_offset'] = attrgetter(self.offset_field)(
                        search[max_log_line - 1].execute()[-1]
                    )
                else:
                    metadata['max_offset'] = 0
            except Exception:
                self.log.exception('Could not get current log size with log_id: %s', log_id)

        logs = []
        if max_log_line != 0:
            try:

                logs = search[self.MAX_LINE_PER_PAGE * self.PAGE : self.MAX_LINE_PER_PAGE].execute()
            except Exception:
                self.log.exception('Could not read log with log_id: %s', log_id)

        return logs
Example #35
def search_list(request, kd=None):
    kd = request.GET.get("kd", None)
    if not kd:
        raise Http404
    else:
        kd = kd.strip()
    page = int(request.GET.get("page", "1"))
    if page > PAGE_MAC_SIZE:
        page = PAGE_MAC_SIZE
    elif page < 1:
        page = 1
    spc = request.GET.get("spc", "1")
    city = request.GET.get("city", u"全国")
    pn_count = (page - 1) * PER_COUNT
    tags = {"cities": [], "websites": []}
    page_size = 1
    response = None
    try:
        search = Search(using=es, index="tuangou", doc_type="meituan").query("match", title=kd).sort('-@timestamp')[
                 pn_count:pn_count + PER_COUNT]
        # s.query(~Q("match", description="beta"))  # match documents whose description does not contain "beta"
        search.aggs.bucket('per_city', 'terms', field='city')  # metric('max_lines', 'max', field='lines')
        search.aggs.bucket('per_website', 'terms', field='website')
        page_size = search.count() // PER_COUNT + 1
        response = search.execute()
        # print search.count()


        # for hit in response:
        #     print dir(hit.meta) # ['doc_type', u'id', u'index', u'score', u'sort']
        #     print dir(hit)

        for tag in response.aggregations.per_city.buckets:
            # print tag.key, tag.doc_count
            tags["cities"].append((tag.key, tag.doc_count))
        # print(tag.key, tag.sum_lines.value)
        for tag in response.aggregations.per_website.buckets:
            # print tag.key, tag.doc_count
            tags["websites"].append((tag.key, tag.doc_count))
    except Exception:
        exception = traceback.format_exc()
        print(exception)
    host_search = [u"美食", u"酒店", u"机票", u"火车票", u"汽车票"]
    if page_size > PAGE_MAC_SIZE:
        page_size = PAGE_MAC_SIZE
    ct = dict({
        'kd': kd,
        'results': response,
        "spc": spc,
        "city": city,
        "page": page,
        "page_size": page_size,
        "host_search": host_search,

    })
    return render(request, 'search/list.html', ct)
Example #36
def GetCountRecords(client, from_date, to_date, query = None):
    """
    Get the number of records (documents) from a date range
    
    :param elasticsearch.client client: The elasticsearch client to use for the search 
    :param str from_date: The from date.  It can be lucene date math, such as 'now-1d' for yesterday
    :param str to_date: The to date.  Can also use lucene date math.
    :param str query: Query string to limit the documents searched.
    :return: The total documents
    """
    s = Search(using=client, index='gracc.osg.raw-*') \
        .filter('range', **{'@timestamp': {'from': from_date, 'to': to_date}})

    response = s.count()
    return response
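# Usage sketch (added): count yesterday's raw records with Lucene date math, assuming
# `client` is an existing Elasticsearch client as documented above.
yesterday_total = GetCountRecords(client, 'now-1d', 'now')
print(yesterday_total)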
Example #37
 def test_periodic_summarizer(self):
     
     # Check the database for new summary records.
     client = Elasticsearch()
     
     # Refresh the indexes
     client.indices.refresh(index='gracc.osg.raw*')
     
     # Restart the graccsumperiodic service 
     subprocess.call("systemctl restart graccsumperiodic.service", shell=True)
     
     # Wait for a bit to make sure the summarizer actually does its thing
     time.sleep(60)
     
     # Refresh the indexes
     client.indices.refresh(index='gracc.osg.summary*')
     time.sleep(60)
     
     # Search for the summary records
     s = Search(using=client, index='gracc.osg.summary*') \
     .filter('range', **{'EndTime': {'from': 'now-7d', 'to': 'now'}})
     
     num_sum = s.count()
     
     stats = client.cat.indices(index='_all')
     print(stats)
     
     self.assertGreater(num_sum, 0)
     
     # Search for the summary transfer records
     s = Search(using=client, index='gracc.osg.transfer-summary*') \
     .filter('range', **{'StartTime': {'from': 'now-7d', 'to': 'now'}})
     
     num_sum = s.count()
     
     self.assertGreater(num_sum, 0)
Example #38
 def test_raw_transfer(self):
     """
     Testing the tester for raw summary data
     """
     # Check the raw indexes for records from the last 7 days
     client = Elasticsearch()
     s = Search(using=client, index='gracc.osg-transfer.raw*') \
     .filter('range', **{'StartTime': {'from': 'now-7d', 'to': 'now'}})
     
     num_raw = s.count()
     
     stats = client.cat.indices(index='_all')
     print(stats)
     
     self.assertGreater(num_raw, 0)
Example #39
    def portalSearch(expression, start=0, end=25):
        client = Elasticsearch()
        ret = {'nodes': [], 'Counts': {}}
        q = Q("bool", must=[Q('match', _all=expression)])
        s = Search(using=client, index="neo4j-inquisite-node", doc_type="Repository,Data").query(q)
        q_total = s.count()
        s = s[0:q_total]
        s = s.highlight_options(require_field_match=False)
        s = s.highlight('*', fragment_size=45)
        res = s.execute()
        data = {}
        uuids = []
        pub_uuids = {}
        if res:
            for r in res:
                d = r.to_dict()
                if r.meta.doc_type == 'Repository':
                    if int(d['published']) == 0:
                        continue
                    repo_id = r.meta.id
                    ret['nodes'].append({"id": r.meta.id, "type": "Repository", "name": d['name'], "description": d['readme']})
                    repo_uuids = SearchManager._getDataUUIDsForRepo(repo_id)
                    pub_uuids[repo_id] = repo_uuids
                else:
                    hits = []
                    highs = r.meta.highlight.to_dict()
                    for high_field,high_value in highs.items():
                        hits.append({high_field: high_value})
                    data[r.meta.id] = {'id': r.meta.id, "hits": hits}
                    uuids.append(r.meta.id)
            qString = "MATCH (r:Repository)--(t:SchemaType)--(d:Data) WHERE d.uuid IN {uuids} AND r.published = '1' RETURN d.uuid as uuid, r.name as repo_name, r.uuid as repo_id"
            pub_data = db.run(qString, {"uuids": uuids})
            data_max = 0
            for checked in pub_data:
                if data_max >= 32:
                    break
                ret['nodes'].append({"id": checked['uuid'], "type": "Data", "repo_id": checked['repo_id'], "repo_name": checked['repo_name'], "hits": data[checked['uuid']]['hits']})
                data_max += 1

            return ret
        else:
            return ret
Example #40
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']
        self._build_fields()

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError('_results_number too large')
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value, full=False)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                if not value:
                    continue

                field_name = self.get_field_name(value)
                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=facets_size,
                )

        # Create signature aggregations.
        if params.get('_aggs.signature'):
            sig_bucket = A(
                'terms',
                field=self.get_field_name('signature'),
                size=facets_size,
            )
            for param in params['_aggs.signature']:
                for value in param.value:
                    if not value:
                        continue

                    if value.startswith('_histogram.'):
                        # This is a histogram aggregation we want to run,
                        # not a terms aggregation.
                        field_name = value[len('_histogram.'):]
                        if field_name not in self.histogram_fields:
                            continue

                        histogram_type = (
                            self.all_fields[field_name]['query_type'] == 'date'
                            and 'date_histogram' or 'histogram'
                        )
                        sig_bucket.bucket(
                            'histogram_%s' % field_name,
                            histogram_type,
                            field=self.get_field_name(field_name),
                            interval=histogram_intervals[field_name],
                        )
                    else:
                        sig_bucket.bucket(
                            value,
                            'terms',
                            field=self.get_field_name(value),
                            size=facets_size,
                        )

            search.aggs.bucket('signature', sig_bucket)

        # Create histograms.
        for f in self.histogram_fields:
            if params.get('_histogram.%s' % f):
                histogram_type = (
                    self.all_fields[f]['query_type'] == 'date'
                    and 'date_histogram' or 'histogram'
                )
                date_bucket = A(
                    histogram_type,
                    field=self.get_field_name(f),
                    interval=histogram_intervals[f],
                )
                for param in params['_histogram.%s' % f]:
                    for value in param.value:
                        if not value:
                            continue

                        field_name = self.get_field_name(value)
                        val_bucket = A(
                            'terms',
                            field=field_name,
                            size=facets_size,
                        )
                        date_bucket.bucket(value, val_bucket)

                search.aggs.bucket('histogram_%s' % f, date_bucket)

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
Example #41
        for greater, lesser in time_range:
            print(greater, " ", lesser)


            s = Search(using=client, index=index_name, extra={ "size": 10000, "timeout": "20m"} ) \
                .query("match", type="nginx-access")  \
                .query(Q("match", tags="us-nginx-access")) \
                .query(Q("match", request_url=url))

            if lesser != 3600:
                s = s.filter('range', **{"response_time": {"gte": greater, "lte": lesser}})
            else:
                s = s.filter('range', **{"response_time": {"gte": greater}})

            count = s.count()
            print("url : " , url , " count : " , count)

            s.aggs.metric('response_time', 'avg', field='response_time')
            response = s.execute()
            avg = response.aggregations.response_time.value
            print("url : " , url , " average : " , response.aggregations)

            s.aggs.metric('response_time', 'max', field='response_time')
            response = s.execute(ignore_cache=True)
            max_time = response.aggregations.response_time.value
            print("url : " , url , " max_time : " , max_time)

            s.aggs.metric('response_time', 'min', field='response_time')
            response = s.execute(ignore_cache=True)
            min_time = response.aggregations.response_time.value
Example #42
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = None

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:

                if param.name.startswith('_'):
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]

                name = '%s.%s' % (
                    field_data['namespace'],
                    field_data['in_database_name']
                )

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                args = {}
                filter_type = 'term'
                filter_value = None
                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]
                        if not isinstance(val, basestring) or (
                            isinstance(val, basestring) and ' ' not in val
                        ):
                            filter_value = val

                        # If the term contains white spaces, we want to perform
                        # a phrase query. Thus we do nothing here and let this
                        # value be handled later.
                    else:
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator == '>':
                    # greater than
                    filter_type = 'range'
                    filter_value = {
                        'gt': param.value
                    }
                elif param.operator == '<':
                    # lower than
                    filter_type = 'range'
                    filter_value = {
                        'lt': param.value
                    }
                elif param.operator == '>=':
                    # greater than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'gte': param.value
                    }
                elif param.operator == '<=':
                    # lower than or equal to
                    filter_type = 'range'
                    filter_value = {
                        'lte': param.value
                    }
                elif param.operator == '__null__':
                    # is null
                    filter_type = 'missing'
                    args['field'] = name

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    if param.operator_not:
                        new_filter = ~F(filter_type, **args)
                    else:
                        new_filter = F(filter_type, **args)

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif param.data_type == 'enum':
                        sub_filters |= new_filter
                    else:
                        sub_filters &= new_filter

                    continue

                # These use a wildcard and thus need to be in a query
                # instead of a filter.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '$': '%s*',  # starts with
                    '^': '*%s'  # ends with
                }
                if param.operator in operator_wildcards:
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    query_type = 'wildcard'
                    args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                elif not param.operator:
                    # This is a phrase that was passed down.
                    query_type = 'simple_query_string'
                    args['query'] = param.value[0]
                    args['fields'] = [name]
                    args['default_operator'] = 'and'

                if args:
                    query = Q(query_type, **args)
                    if param.operator_not:
                        query = ~query
                    search = search.query(query)
                else:
                    # If we reach this point, that means the operator is
                    # not supported, and we should raise an error about that.
                    raise NotImplementedError(
                        'Operator %s is not supported' % param.operator
                    )

            if filters is None:
                filters = sub_filters
            elif sub_filters is not None:
                filters &= sub_filters

        search = search.filter(filters)

        # Restricting returned fields.
        fields = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't restrict on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot return it' % value
                    )

                if not field_['is_returned']:
                    # Returning this field is not allowed.
                    raise BadArgumentError(
                        value,
                        msg='Field "%s" is not allowed to be returned' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product and descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't sort on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot sort on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        for param in params['_facets']:
            for value in param.value:
                try:
                    field_ = self.all_fields[value]
                except KeyError:
                    # That is not a known field, we can't facet on it.
                    raise BadArgumentError(
                        value,
                        msg='Unknown field "%s", cannot facet on it' % value
                    )

                field_name = '%s.%s' % (
                    field_['namespace'],
                    field_['in_database_name']
                )

                if field_['has_full_version']:
                    # If the param has a full version, that means what matters
                    # is the full string, and not its individual terms.
                    field_name += '.full'

                search.aggs.bucket(
                    value,
                    'terms',
                    field=field_name,
                    size=self.config.facets_max_number
                )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()
                aggregations = self.format_aggregations(results.aggregations)
                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    break
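The long if/elif chain in Example #42 is essentially a lookup table that maps a user-facing operator to an Elasticsearch filter or query type. A stripped-down sketch of that translation, using plain Q objects instead of the project's F filters and its field metadata, could look like the following; the helper and the field name in the usage comment are hypothetical.

from elasticsearch_dsl import Q

OPERATOR_RANGE = {'>': 'gt', '<': 'lt', '>=': 'gte', '<=': 'lte'}

def build_clause(field_name, operator, value):
    """Translate one (field, operator, value) triple into a Q object."""
    if operator in OPERATOR_RANGE:
        # e.g. '>=' becomes {'range': {field_name: {'gte': value}}}
        return Q('range', **{field_name: {OPERATOR_RANGE[operator]: value}})
    if operator == '=':
        return Q('term', **{field_name: value})
    if operator == '__null__':
        # "is null": modern equivalent of the old 'missing' filter.
        return ~Q('exists', field=field_name)
    if not operator:
        if isinstance(value, str) and ' ' in value:
            # A bare multi-word value is treated as a phrase-style query.
            return Q('simple_query_string', query=value, fields=[field_name],
                     default_operator='and')
        return Q('term', **{field_name: value})
    raise NotImplementedError('Operator %s is not supported' % operator)

# Usage (hypothetical field): build_clause('processed_crash.uptime', '>=', 3600)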
Example #43
0
    def get(self, **kwargs):
        """Return a list of results and aggregations based on parameters.

        The list of accepted parameters (with types and default values) is in
        the database and can be accessed with the super_search_fields service.
        """
        # Require that the list of fields be passed.
        if not kwargs.get('_fields'):
            raise MissingArgumentError('_fields')
        self.all_fields = kwargs['_fields']

        # Filter parameters and raise potential errors.
        params = self.get_parameters(**kwargs)

        # Find the indices to use to optimize the elasticsearch query.
        indices = self.get_indices(params['date'])

        # Create and configure the search object.
        search = Search(
            using=self.get_connection(),
            index=indices,
            doc_type=self.config.elasticsearch.elasticsearch_doctype,
        )

        # Create filters.
        filters = []
        histogram_intervals = {}

        for field, sub_params in params.items():
            sub_filters = None
            for param in sub_params:
                if param.name.startswith('_'):
                    # By default, all param values are turned into lists,
                    # even when they have and can have only one value.
                    # For those we know there can only be one value,
                    # so we just extract it from the made-up list.
                    if param.name == '_results_offset':
                        results_from = param.value[0]
                    elif param.name == '_results_number':
                        results_number = param.value[0]
                        if results_number > 1000:
                            raise BadArgumentError(
                                '_results_number',
                                msg=(
                                    '_results_number cannot be greater '
                                    'than 1,000'
                                )
                            )
                        if results_number < 0:
                            raise BadArgumentError(
                                '_results_number',
                                msg='_results_number cannot be negative'
                            )
                    elif param.name == '_facets_size':
                        facets_size = param.value[0]
                        # Why cap it?
                        # Because if the query is covering a lot of different
                        # things you can get a really really large query
                        # which can hog resources excessively.
                        # Downloading, as an example, 100k facets (and 0 hits)
                        # when there is plenty of data yields a 11MB JSON
                        # file.
                        if facets_size > 10000:
                            raise BadArgumentError(
                                '_facets_size greater than 10,000'
                            )

                    for f in self.histogram_fields:
                        if param.name == '_histogram_interval.%s' % f:
                            histogram_intervals[f] = param.value[0]

                    # Don't use meta parameters in the query.
                    continue

                field_data = self.all_fields[param.name]
                name = self.get_full_field_name(field_data)

                if param.data_type in ('date', 'datetime'):
                    param.value = datetimeutil.date_to_string(param.value)
                elif param.data_type == 'enum':
                    param.value = [x.lower() for x in param.value]
                elif param.data_type == 'str' and not param.operator:
                    param.value = [x.lower() for x in param.value]

                # Operators needing wildcards, and the associated value
                # transformation with said wildcards.
                operator_wildcards = {
                    '~': '*%s*',  # contains
                    '^': '%s*',  # starts with
                    '$': '*%s'  # ends with
                }
                # Operators needing ranges, and the associated Elasticsearch
                # comparison operator.
                operator_range = {
                    '>': 'gt',
                    '<': 'lt',
                    '>=': 'gte',
                    '<=': 'lte',
                }

                args = {}
                filter_type = 'term'
                filter_value = None

                if not param.operator:
                    # contains one of the terms
                    if len(param.value) == 1:
                        val = param.value[0]

                        if not isinstance(val, basestring) or ' ' not in val:
                            # There's only one term and no white space, this
                            # is a simple term filter.
                            filter_value = val
                        else:
                            # If the term contains white spaces, we want to
                            # perform a phrase query.
                            filter_type = 'query'
                            args = Q(
                                'simple_query_string',
                                query=param.value[0],
                                fields=[name],
                                default_operator='and',
                            ).to_dict()
                    else:
                        # There are several terms, this is a terms filter.
                        filter_type = 'terms'
                        filter_value = param.value
                elif param.operator == '=':
                    # is exactly
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_range:
                    filter_type = 'range'
                    filter_value = {
                        operator_range[param.operator]: param.value
                    }
                elif param.operator == '__null__':
                    filter_type = 'missing'
                    args['field'] = name
                elif param.operator == '__true__':
                    filter_type = 'term'
                    filter_value = True
                elif param.operator == '@':
                    filter_type = 'regexp'
                    if field_data['has_full_version']:
                        name = '%s.full' % name
                    filter_value = param.value
                elif param.operator in operator_wildcards:
                    filter_type = 'query'

                    # Wildcard operations are better applied to a non-analyzed
                    # field (called "full") if there is one.
                    if field_data['has_full_version']:
                        name = '%s.full' % name

                    q_args = {}
                    q_args[name] = (
                        operator_wildcards[param.operator] % param.value
                    )
                    query = Q('wildcard', **q_args)
                    args = query.to_dict()

                if filter_value is not None:
                    args[name] = filter_value

                if args:
                    new_filter = F(filter_type, **args)
                    if param.operator_not:
                        new_filter = ~new_filter

                    if sub_filters is None:
                        sub_filters = new_filter
                    elif filter_type == 'range':
                        sub_filters &= new_filter
                    else:
                        sub_filters |= new_filter

                    continue

            if sub_filters is not None:
                filters.append(sub_filters)

        search = search.filter(F('bool', must=filters))

        # Restricting returned fields.
        fields = []

        # We keep track of the requested columns in order to make sure we
        # return those column names and not aliases for example.
        self.request_columns = []
        for param in params['_columns']:
            for value in param.value:
                if not value:
                    continue

                self.request_columns.append(value)
                field_name = self.get_field_name(value, full=False)
                fields.append(field_name)

        search = search.fields(fields)

        # Sorting.
        sort_fields = []
        for param in params['_sort']:
            for value in param.value:
                if not value:
                    continue

                # Values starting with a '-' are sorted in descending order.
                # In order to retrieve the database name of the field, we
                # must first remove the '-' part and add it back later.
                # Example: given ['product', '-version'], the results will be
                # sorted by ascending product then descending version.
                desc = False
                if value.startswith('-'):
                    desc = True
                    value = value[1:]

                field_name = self.get_field_name(value)

                if desc:
                    # The underlying library understands that '-' means
                    # sorting in descending order.
                    field_name = '-' + field_name

                sort_fields.append(field_name)

        search = search.sort(*sort_fields)

        # Pagination.
        results_to = results_from + results_number
        search = search[results_from:results_to]

        # Create facets.
        if facets_size:
            self._create_aggregations(
                params,
                search,
                facets_size,
                histogram_intervals
            )

        # Query and compute results.
        hits = []

        if params['_return_query'][0].value[0]:
            # Return only the JSON query that would be sent to elasticsearch.
            return {
                'query': search.to_dict(),
                'indices': indices,
            }

        errors = []

        # We call elasticsearch with a computed list of indices, based on
        # the date range. However, if that list contains indices that do not
        # exist in elasticsearch, an error will be raised. We thus want to
        # remove all failing indices until we either have a valid list, or
        # an empty list in which case we return no result.
        while True:
            try:
                results = search.execute()
                for hit in results:
                    hits.append(self.format_fields(hit.to_dict()))

                total = search.count()

                aggregations = getattr(results, 'aggregations', {})
                if aggregations:
                    aggregations = self.format_aggregations(aggregations)

                shards = getattr(results, '_shards', {})

                break  # Yay! Results!
            except NotFoundError as e:
                missing_index = re.findall(BAD_INDEX_REGEX, e.error)[0]
                if missing_index in indices:
                    del indices[indices.index(missing_index)]
                else:
                    # Wait what? An error caused by an index that was not
                    # in the request? That should never happen, but in case
                    # it does, better know it.
                    raise

                errors.append({
                    'type': 'missing_index',
                    'index': missing_index,
                })

                if indices:
                    # Update the list of indices and try again.
                    # Note: we need to first empty the list of indices before
                    # updating it, otherwise the removed indices never get
                    # actually removed.
                    search = search.index().index(*indices)
                else:
                    # There is no index left in the list, return an empty
                    # result.
                    hits = []
                    total = 0
                    aggregations = {}
                    shards = None
                    break
            except RequestError as exception:
                # Try to handle it gracefully if we can find out what
                # input was bad and caused the exception.
                try:
                    bad_input = ELASTICSEARCH_PARSE_EXCEPTION_REGEX.findall(
                        exception.error
                    )[-1]
                    # Loop over the original parameters to try to figure
                    # out which *key* had the bad input.
                    for key, value in kwargs.items():
                        if value == bad_input:
                            raise BadArgumentError(key)
                except IndexError:
                    # Not an ElasticsearchParseException exception
                    pass
                raise
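Both super-search examples end by turning the _results_offset and _results_number parameters into a slice on the Search object, which maps to 'from' and 'size' in the request body. A minimal sketch of that pagination step, with made-up values and a hypothetical index name:

from elasticsearch_dsl import Search

results_from = 100    # _results_offset
results_number = 50   # _results_number (capped at 1,000 in Example #43)

search = Search(index='crash-reports')  # hypothetical index name
search = search[results_from:results_from + results_number]

# The slice becomes 'from' and 'size' in the generated request body.
print(search.to_dict())  # includes 'from': 100 and 'size': 50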