Example #1
    def test_es_manager(self):
        """Test the behavior of the ``es_manager`` command"""

        if not self.index_manager.connected_to_es:
            return

        # in the beginning: the void
        self.assertTrue(self.index_manager.index not in
                        self.index_manager.es.cat.indices())

        text = "Ceci est un texte de test"

        # create a topic with a post
        topic = TopicFactory(forum=self.forum, author=self.user, title=text)
        post = PostFactory(topic=topic, author=self.user, position=1)
        post.text = post.text_html = text
        post.save()

        topic = Topic.objects.get(pk=topic.pk)
        post = Post.objects.get(pk=post.pk)

        self.assertFalse(topic.es_already_indexed)
        self.assertTrue(topic.es_flagged)
        self.assertFalse(post.es_already_indexed)
        self.assertTrue(post.es_flagged)

        # create a middle-tutorial and publish it
        tuto = PublishableContentFactory(type="TUTORIAL")
        tuto.authors.add(self.user)
        tuto.save()

        tuto_draft = tuto.load_version()
        chapter1 = ContainerFactory(parent=tuto_draft, db_object=tuto)
        chapter1.repo_update(text, text, text)
        extract1 = ExtractFactory(container=chapter1, db_object=tuto)
        version = extract1.repo_update(text, text)
        published = publish_content(tuto, tuto_draft, is_major_update=True)

        tuto.sha_public = version
        tuto.sha_draft = version
        tuto.public_version = published
        tuto.save()

        published = PublishedContent.objects.get(content_pk=tuto.pk)
        self.assertFalse(published.es_already_indexed)
        self.assertTrue(published.es_flagged)

        # 1. test "index_all"
        call_command("es_manager", "index_all")
        self.assertTrue(
            self.index_manager.es.indices.exists(self.index_manager.index))
        self.index_manager.index_exists = True

        topic = Topic.objects.get(pk=topic.pk)
        post = Post.objects.get(pk=post.pk)

        self.assertTrue(topic.es_already_indexed)
        self.assertFalse(topic.es_flagged)
        self.assertTrue(post.es_already_indexed)
        self.assertFalse(post.es_flagged)

        published = PublishedContent.objects.get(content_pk=tuto.pk)
        self.assertTrue(published.es_already_indexed)
        self.assertFalse(published.es_flagged)

        s = Search()
        s = s.query(MatchAll())  # query() returns a new Search, so reassign
        results = self.index_manager.setup_search(s).execute()
        self.assertEqual(len(results), 4)  # get 4 results, one of each type

        must_contain = {
            "post": False,
            "topic": False,
            "publishedcontent": False,
            "chapter": False
        }
        id_must_be = {
            "post": str(post.pk),
            "topic": str(topic.pk),
            "publishedcontent": str(published.pk),
            "chapter": tuto.slug + "__" + chapter1.slug,
        }

        for hit in results:
            doc_type = hit.meta.doc_type
            must_contain[doc_type] = True
            self.assertEqual(hit.meta.id, id_must_be[doc_type])

        self.assertTrue(all(must_contain.values()))  # check the values, not the keys

        # 2. test "clear"
        self.assertTrue(self.index_manager.index
                        in self.index_manager.es.cat.indices())  # index in

        call_command("es_manager", "clear")
        self.assertFalse(
            self.index_manager.es.indices.exists(self.index_manager.index))
        self.index_manager.index_exists = False

        # must reset every object
        topic = Topic.objects.get(pk=topic.pk)
        post = Post.objects.get(pk=post.pk)

        self.assertFalse(topic.es_already_indexed)
        self.assertTrue(topic.es_flagged)
        self.assertFalse(post.es_already_indexed)
        self.assertTrue(post.es_flagged)

        published = PublishedContent.objects.get(content_pk=tuto.pk)
        self.assertFalse(published.es_already_indexed)
        self.assertTrue(published.es_flagged)

        self.assertTrue(
            self.index_manager.index
            not in self.index_manager.es.cat.indices())  # index wiped out !

        # 3. test "setup"
        call_command("es_manager", "setup")
        self.assertTrue(
            self.index_manager.es.indices.exists(self.index_manager.index))
        self.index_manager.index_exists = True

        self.assertTrue(
            self.index_manager.index
            in self.index_manager.es.cat.indices())  # index back in ...

        s = Search()
        s = s.query(MatchAll())
        results = self.index_manager.setup_search(s).execute()
        self.assertEqual(len(results), 0)  # ... but with nothing in it

        result = self.index_manager.es.indices.get_settings(
            index=self.index_manager.index)
        settings_index = result[self.index_manager.index]["settings"]["index"]
        self.assertTrue("analysis"
                        in settings_index)  # custom analyzer was setup

        # 4. test "index_flagged" once ...
        call_command("es_manager", "index_flagged")

        topic = Topic.objects.get(pk=topic.pk)
        post = Post.objects.get(pk=post.pk)

        self.assertTrue(topic.es_already_indexed)
        self.assertFalse(topic.es_flagged)
        self.assertTrue(post.es_already_indexed)
        self.assertFalse(post.es_flagged)

        published = PublishedContent.objects.get(content_pk=tuto.pk)
        self.assertTrue(published.es_already_indexed)
        self.assertFalse(published.es_flagged)

        s = Search()
        s = s.query(MatchAll())
        results = self.index_manager.setup_search(s).execute()
        self.assertEqual(len(results), 4)  # get the 4 results back
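
The assertions above lean on the `Search()`/`MatchAll()` pattern to inspect what ended up in the index. A minimal standalone sketch of that pattern, assuming a local cluster and an illustrative index name:

from elasticsearch_dsl import Search, connections
from elasticsearch_dsl.query import MatchAll

connections.create_connection(hosts=['localhost'])   # assumed local cluster

s = Search(index='my_index').query(MatchAll())        # illustrative index name
for hit in s.execute():
    print(hit.meta.index, hit.meta.id)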
Example #2
from elasticsearch_dsl import connections, Document, Text, Keyword, Date, Search
from elasticsearch import Elasticsearch

connections.create_connection(hosts=['127.0.0.1'], timeout=20)
es = Elasticsearch()

class Blogpost(Document):
    title = Text()  # lower-case so it matches the attribute assigned below
    published_date = Date()
    published_by = Text()
    tags = Keyword()
    body = Text()

    class Index:
        name = 'blog_index'  # index will be created automatically

bg = Blogpost()
bg.title = "A better way to use elasticsearch python"
bg.published_date = "2019-01-12"  # ISO format so the Date field parses it
bg.published_by = "Kartik"
bg.tags = ['tech', 'elasticsearch', 'python']
bg.body = "Who does not love object-oriented programming paradigms"
bg.save()


s = Search(using=es, index="blog_index").filter("term", tags="tech").query("match", title="python")\
    .exclude("match", body="Hello world")
Example #3
    def __init__(self, host: str, index_name: str, *args, **kwargs):
        self.search = Search(using=Elasticsearch(hosts=host), index=index_name)
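
Only the constructor of this wrapper survives in the fragment above. A hedged sketch of how such a class might be fleshed out; the class and method names below are illustrative, not from the original project:

from typing import Any

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search


class NewsSearcher:
    def __init__(self, host: str, index_name: str, *args: Any, **kwargs: Any):
        self.search = Search(using=Elasticsearch(hosts=host), index=index_name)

    def match_title(self, text: str):
        # query() returns a new Search object, so chain and execute the copy
        return self.search.query('match', title=text).execute()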
Example #4
def dashboard(request):
    count_ips, count_domains = total_data()
    count_bugs = total_bug()
    total = {
        "ips": count_ips,
        "domains": count_domains,
        "bugs": count_bugs
    }
    # Asset-related data
    data = properly.objects.order_by("-id").all()

    # Chart statistics
    payload = {"size": 0,
               "aggs": {
                   "sales": {
                       "date_histogram": {
                           "field": "published_from",
                           "interval": STATIC_TASKS,
                           "format": "yyyy-MM-dd"
                       }
                   }
               }
               }
    s = Search(using=es, index='w12scan').from_dict(payload)
    res = s.execute().to_dict()
    try:
        charts = res["aggregations"]["sales"]["buckets"]
    except KeyError:
        charts = []
    data_chart = {
        "labels": [],
        "data": []
    }
    for item in charts:
        count = item["doc_count"]
        if count == 0:
            continue
        data_chart["labels"].append(item["key_as_string"])
        data_chart["data"].append(item["doc_count"])

    # Bar chart
    names = count_name(6)
    data_bar = {
        "labels": [],
        "data": []
    }
    for item in names:
        data_bar["labels"].append(item["key"])
        data_bar["data"].append(item["doc_count"])

    # node monitor
    nodenames = redis_con.keys("w12_node_*")
    nodes = []
    for nodename in nodenames:
        dd = redis_con.hgetall(nodename)
        tem_dict = {}
        tem_dict["nodename"] = lstrsub(nodename, "w12_node_")
        tem_dict["last_time"] = dd.get("last_time", 0)
        tem_dict["tasks"] = dd.get("tasks", "error")
        tem_dict["running"] = dd.get("running", "error")
        tem_dict["finished"] = dd.get("finished", "error")
        tem_dict["status"] = "Running"
        if time.time() - float(tem_dict["last_time"]) > 60 * 5:
            tem_dict["status"] = "Pending"
        tem_dict["time"] = smartDate(float(tem_dict["last_time"]))
        nodes.append(tem_dict)

    # Vulnerability chart for bug[domain]
    dd = es.indices.get_mapping(index='w12scan', doc_type='domains')
    dd = dd["w12scan"]["mappings"]["domains"]["properties"]
    data_bugs = []
    if "bugs" in dd:
        bug_type = dd["bugs"]["properties"].keys()
        index = 0
        for bug_name in bug_type:
            index += 1
            count = get_bug_count('domains', bug_name)
            dd = {}
            _cls = ["primary", "info", "danger", "success", "warning"]
            dd["label"] = bug_name
            dd["count"] = count
            dd["cls"] = _cls[index % 5]
            data_bugs.append(dd)

    return render(request, "frontend/dashboard.html",
                  {"total": total, "zc_data": data, "data_chart": data_chart, "data_bar": data_bar, "nodes": nodes,
                   "data_bugs": data_bugs})
Example #5
def index(request):
    page = request.GET.get("p", "1")
    q = request.GET.get("q", None)
    try:
        page = int(page)
    except ValueError:
        page = 1
    if page <= 0:
        page = 1

    es = Elasticsearch(ELASTICSEARCH_HOSTS)
    start_time = datetime.now()
    keywords = None
    if q is None:
        _search = {
            "from": (page - 1) * 20,
            "size": 20,
            "sort": {"published_from": {"order": "desc"}}
        }
    else:
        _search, keywords = k2e_search(q, page)
    s = Search(using=es, index='w12scan').from_dict(_search)
    count = s.execute().hits.total

    # Pagination logic
    max_page = math.ceil(count / 20)
    if page <= 5:
        paginations = range(1, 10)
    elif page + 5 > max_page:
        paginations = range(max_page - 5, max_page + 5)
    else:
        paginations = range(page - 5, page + 5)
    temp_pagin = []
    for i in paginations:
        if i <= max_page:
            temp_pagin.append(i)
    paginations = temp_pagin

    pagination = {
        "max_page": str(max_page),
        "current": page,
        "pre": str(page - 1) if page - 1 > 0 else "1",
        "next": str(page + 1) if page + 1 <= max_page else str(max_page),
        "paginations": paginations,
        "keyword": ""
    }
    if q is not None:
        pagination["keyword"] = "&q=" + q
    # End of pagination

    datas = []
    for hit in s:
        doc_type = hit.meta.doc_type
        id = hit.meta.id
        d = {}
        if doc_type == "ips":
            d.update(hit.to_dict())
            if d.get("infos"):
                d["info_tags"] = []
                for info in d["infos"]:
                    d["info_tags"].append("{}/{}".format(info["port"], info.get("name", "unknown")))
                d["infos"] = json.dumps(d["infos"], indent=2)
            # Asset association
            d["proper"] = is_proper(d["target"], "ip")
        elif doc_type == "domains":
            d.update(hit.to_dict())
            d["target"] = d.get("title") or d.get("url")
            if d.get("ip"):
                ip = d.get("ip")
                ip_info = es_search_ip(ip, True)
                if ip_info:
                    d["location"] = ip_info.location
            d["proper"] = is_proper(d["url"], "domain")
        d["doc_type"] = doc_type
        d["id"] = id
        d["published_from"] = datetime_string_format(d["published_from"])
        datas.append(d)

    # Statistics for the left-hand sidebar
    statistics = {}
    # 1. Component statistics
    apps = count_app()
    countrys = count_country()
    names = count_name()
    ports = count_port()
    statistics["apps"] = apps
    statistics["countrys"] = countrys
    statistics["names"] = names
    statistics["ports"] = ports

    # Total elapsed time
    end_time = (datetime.now() - start_time).total_seconds()

    return render(request, "frontend/recent.html",
                  {"datas": datas, "count": count, "second": end_time, "pagination": pagination,
                   "statistics": statistics, "keyword": keywords})
Example #6
from datetime import datetime
from elasticsearch import Elasticsearch, NotFoundError
from elasticsearch_dsl import Search
from es_models import Items, Bids
# from view_helper import create_bid_id

es = Elasticsearch()

s = Search(es, index='item', doc_type='item')

INITIAL_ITEM_STATUS = 'Available'


def create_bid_id(bidder, item):
    return (bidder + '_' + item.item_name)


def add_item(seller, item_name, min_bid):
    item = Items(meta={'id': item_name})
    item.item_name = item_name
    item.status = INITIAL_ITEM_STATUS
    item.seller = seller
    # item.created_at = datetime.now().strftime("%a %b %d %H:%M:%S %Y")
    item.min_bid = min_bid
    item.sold_to = None
    item.save()
    return item


def create_bid(item, bidder, bid_amount):
    bid = Bids()
Example #7
def checkticket(ip_src, cipher, msg_type, timestamp):

    # Wait output to Elasticsearch
    time.sleep(3)

    es = Elasticsearch('10.0.19.112:9200')
    s = Search(using=es, index="cipher-*")
    s = s[0:10000]

    # Check old cipher
    if msg_type == 12:
        q = (Q('match', layers__kerberos_msg_type=11)
             | Q('match', layers__kerberos_msg_type=13)) & Q(
                 'match', layers__ip_dst__keyword=ip_src) & Q(
                     'match', layers__kerberos_cipher__keyword=cipher)
    if msg_type == 14:
        q = Q('match', layers__kerberos_msg_type=13) & Q(
            'match', layers__ip_dst__keyword=ip_src) & Q(
                'match', layers__kerberos_cipher__keyword=cipher)
    s1 = s.query(q)
    response = s1.execute()
    if len(response) != 0:
        print('matched with old cipher at ' + str(timestamp))

    else:
        # Check TKT Expire
        qtime = Q('range',
                  timestamp={
                      'gte': int(timestamp),
                      'lte': int(timestamp) + 1000
                  }) & Q('match', layers__ip_dst__keyword=ip_src) & Q(
                      'match', layers__kerberos_error_code=32)
        s2 = s.query(qtime)
        response2 = s2.execute()
        if len(response2) != 0:
            print('TKT Expired at ' + str(timestamp))

        else:
            # Update packet-* index
            time.sleep(5)
            s = Search(using=es, index="packet-*")
            s = s[0:10000]
            qsilver = Q('match', layers__kerberos_cipher__keyword=cipher)
            s3 = s.query(qsilver)
            response3 = s3.execute()

            for h in response3:
                id = h.meta.id
                index = h.meta.index
                if msg_type == 12:
                    detect_golden.detect_golden(ip_src)
                    es.update(index=index,
                              doc_type='doc',
                              id=id,
                              body={
                                  'doc': {
                                      'indicator':
                                      'attack: Golden Ticket is used'
                                  }
                              })
                if msg_type == 14:
                    es.update(index=index,
                              doc_type='doc',
                              id=id,
                              body={
                                  'doc': {
                                      'indicator':
                                      'attack: Silver Ticket is used'
                                  }
                              })
            if msg_type == 12:
                print('Golden ticket was used on ' + str(ip_src) + ' at ' +
                      str(timestamp))
            if msg_type == 14:
                print('Silver ticket was used on ' + str(ip_src) + ' at ' +
                      str(timestamp))
Example #8
def search(search_params,
           index,
           page_size,
           ip,
           request,
           filter_dead,
           page=1) -> Tuple[List[Hit], int, int]:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `ImageSearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :param request: Django's request object.
    :param filter_dead: Whether dead links should be removed.
    :param page: The results page number.
    :return: Tuple with a List of Hits from elasticsearch, the page count,
    the result count, and a spelling suggestion.
    """
    s = Search(index=index)
    # Apply term filters. Each tuple pairs a filter's parameter name in the API
    # with its corresponding field in Elasticsearch. "None" means that the
    # names are identical.
    filters = [('extension', None), ('categories', None),
               ('aspect_ratio', None), ('size', None), ('source', 'provider'),
               ('license', 'license__keyword'),
               ('license_type', 'license__keyword')]
    for tup in filters:
        api_field, elasticsearch_field = tup
        s = _apply_filter(s, search_params, api_field, elasticsearch_field)
    # Get suggestions for any route
    s = s.suggest('get_suggestion', '', term={'field': 'creator'})
    # Exclude mature content unless explicitly enabled by the requester
    if not search_params.data['mature']:
        s = s.exclude('term', mature=True)
    # Hide data sources from the catalog dynamically.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = models.ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    to_exclude = [f['provider_identifier'] for f in filtered_providers]
    s = s.exclude('terms', provider=to_exclude)

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    search_fields = ['tags.name', 'title', 'description']
    if 'q' in search_params.data:
        query = _quote_escape(search_params.data['q'])
        s = s.query('simple_query_string', query=query, fields=search_fields)
        # Get suggestions for term query
        s = s.suggest('get_suggestion', query, term={'field': 'creator'})
    else:
        if 'creator' in search_params.data:
            creator = _quote_escape(search_params.data['creator'])
            s = s.query('simple_query_string',
                        query=creator,
                        fields=['creator'])
            # Get suggestions for creator
            s = s.suggest('get_suggestion', creator, term={'field': 'creator'})
        if 'title' in search_params.data:
            title = _quote_escape(search_params.data['title'])
            s = s.query('simple_query_string', query=title, fields=['title'])
            # Get suggestions for title
            s = s.suggest('get_suggestion', title, term={'field': 'title'})
        if 'tags' in search_params.data:
            tags = _quote_escape(search_params.data['tags'])
            s = s.query('simple_query_string',
                        fields=['tags.name'],
                        query=tags)
            # Get suggestions for tags
            s = s.suggest('get_suggestion', tags, term={'field': 'tags.name'})
    # Boost by popularity metrics
    if POPULARITY_BOOST:
        queries = []
        factors = ['comments', 'views', 'likes']
        boost_factor = 100 / len(factors)
        for factor in factors:
            rank_feature_query = Q('rank_feature',
                                   field=factor,
                                   boost=boost_factor)
            queries.append(rank_feature_query)
        s = Search().query(
            Q('bool', must=s.query, should=queries, minimum_should_match=1))

    # Use highlighting to determine which fields contribute to the selection of
    # top results.
    s = s.highlight(*search_fields)
    s = s.highlight_options(order='score')
    s = s.extra(track_scores=True)
    # Route users to the same Elasticsearch worker node to reduce
    # pagination inconsistencies and increase cache hits.
    s = s.params(preference=str(ip), request_timeout=7)
    # Paginate
    start, end = _get_query_slice(s, page_size, page, filter_dead)
    s = s[start:end]
    try:
        search_response = s.execute()
        log.info(f'query={s.to_dict()}, es_took_ms={search_response.took}')
    except RequestError as e:
        raise ValueError(e)
    results = _post_process_results(s, start, end, page_size, search_response,
                                    request, filter_dead)

    suggestion = _query_suggestions(search_response)

    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size)

    return results, page_count, result_count, suggestion
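
The term-filter loop above calls a private _apply_filter helper that is not shown here. A purely illustrative sketch of what such a helper could look like, not the project's actual implementation:

def _apply_filter(s, search_params, api_field, elasticsearch_field=None):
    """Hypothetical sketch only; the real helper is not shown above."""
    value = search_params.data.get(api_field)
    if not value:
        return s
    field = elasticsearch_field or api_field
    # filter() adds a non-scoring clause to the bool query's filter context
    return s.filter('terms', **{field: value.split(',')})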
Example #9
    def search(self, keyword):
        s = Search(using=self.client, index=self.news_index)\
            .query("match", content=keyword)

        response = s.execute()
        return response
Example #10
from elasticsearch_dsl import Search

from nlpmonitor.settings import ES_CLIENT, ES_INDEX_DOCUMENT_EVAL, ES_INDEX_DOCUMENT

from sklearn.metrics import classification_report, roc_auc_score

tm_name = "bigartm_two_years_old_parse"
criterion_id = 35

s = Search(using=ES_CLIENT,
           index=f"{ES_INDEX_DOCUMENT_EVAL}_{tm_name}_{criterion_id}")
s = s.source(('value', 'document_es_id'))

# Values
document_values = dict((h.document_es_id, h.value) for h in s.scan())

# Resonance thresholds
std = Search(using=ES_CLIENT, index=f"{ES_INDEX_DOCUMENT}") \
          .filter("range", num_views={"gt": 0})
std.aggs.bucket("sources", agg_type="terms", field="source", size=100) \
    .metric("stats", agg_type="extended_stats", field="num_views")
r = std.execute()

source_resonances = dict(
    (bucket.key, bucket.stats) for bucket in r.aggregations.sources.buckets)
source_resonance_means = dict(
    ((source, stats.avg) for source, stats in source_resonances.items()))
source_resonance_stds = dict(((source, stats.std_deviation)
                              for source, stats in source_resonances.items()))
sigma_threshold = 1
source_resonance_thresholds = dict(
    # completion assumed from the surrounding variables: mean + sigma * std per source
    (source, source_resonance_means[source] +
     sigma_threshold * source_resonance_stds[source])
    for source in source_resonances)
Example #11
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch(timeout=60)
s = Search(using=client, index="winlogbeat-*")
s = s.source(includes=['winlog.provider_name', 'winlog.event_id'])  # source() returns a new Search

maps = {}

count = 0
for hit in s.scan():
    if hit.winlog.provider_name not in maps:
        maps[hit.winlog.provider_name] = {}

    if hit.winlog.event_id not in maps[hit.winlog.provider_name]:
        maps[hit.winlog.provider_name][hit.winlog.event_id] = 1
    else:
        maps[hit.winlog.provider_name][hit.winlog.event_id] += 1

    count += 1

    if count % 1000 == 0:
        print("Progress %d" % count)

for (provider_name, v) in maps.items():
    for (event_id, count) in v.items():
        print('%s: %d (%d)' % (provider_name, event_id, count))
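
The scan() loop above counts provider/event pairs client-side. A hedged alternative sketch that pushes the counting into Elasticsearch with nested terms aggregations; the bucket sizes are illustrative and the fields are assumed to be keyword-mapped:

s = Search(using=client, index='winlogbeat-*').extra(size=0)
s.aggs.bucket('providers', 'terms', field='winlog.provider_name', size=500) \
    .bucket('event_ids', 'terms', field='winlog.event_id', size=500)
response = s.execute()
for provider in response.aggregations.providers.buckets:
    for event in provider.event_ids.buckets:
        print('%s: %s (%d)' % (provider.key, event.key, event.doc_count))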
Example #12
def search(page='0', dic=None, content=None):

    limitpage = 15
    validresult = False
    orderlabel = 0
    orderarray = []
    if content is not None:
        q = Q("multi_match",
              query=content,
              fields=[
                  'ip', 'name', 'product', 'script', 'detail', 'head',
                  'hackinfo', 'keywords', 'disclosure'
              ])

    else:
        searcharray = []
        keys = dic.keys()
        orderlabel = 0

        for key in keys:
            if key == 'name':
                searcharray.append(Q('term', name=dic[key]))
            if key == 'ip':
                searcharray.append(Q('term', ip=dic[key]))
            if key == 'port':
                searcharray.append(Q('term', port=dic[key]))
            if key == 'state':
                searcharray.append(Q('term', state=dic[key]))
            if key == 'timesearch':
                searcharray.append(Q('match', timesearch=dic[key]))
            if key == 'keywords':
                searcharray.append(Q('match', keywords=dic[key]))
            if key == 'product':
                searcharray.append(Q('match', product=dic[key]))
            if key == 'version':
                searcharray.append(Q('match', version=dic[key]))
            if key == 'script':
                searcharray.append(Q('match', script=dic[key]))
            if key == 'hackinfo':
                searcharray.append(Q('match', hackinfo=dic[key]))
            if key == 'head':
                searcharray.append(Q('match', head=dic[key]))
            if key == 'detail':
                searcharray.append(Q('match', detail=dic[key]))
            if key == 'disclosure':
                searcharray.append(Q('match', disclosure=dic[key]))
            if key == 'webtitle':
                searcharray.append(Q('match', webtitle=dic[key]))
            if key == 'webkeywords':
                searcharray.append(Q('match', webkeywords=dic[key]))
            if key == 'order':
                orderarray.append(dic[key])
                orderlabel = 1

        q = Q('bool', must=searcharray)

    if orderlabel == 0:
        s = Search(index='datap', doc_type='snifferdata').query(q)
    else:
        s = Search(index='datap',
                   doc_type='snifferdata').query(q).sort(orderarray[0])

#     s = Search.from_dict({"query": {
#     "bool":{
#             "must":[
#                 {
#                     "term":{"name":"http"},
#                     "term":{"port":"80"},
#
#                 },
#                 {
#                     "match":{"head":"manager"},
#                      "match":{"head":"200"},
#                 }
#                 ]
#         }
#
# }
# })
    s = s[int(page) * limitpage:int(page) * limitpage + limitpage]

    response = s.execute()
    if response.success():

        portarray = []
        count = response.hits.total
        print('Total hits reported: %d' % count)
        if count == 0:
            pagecount = 0
        elif count % limitpage > 0:
            pagecount = (count + limitpage - 1) // limitpage
        else:
            pagecount = count // limitpage
        from nmaptoolbackground.model import ports
        count = len(response)
        print('Actual number of results returned: %d' % count)
        if count > 0:
            for temp in response:
                dic = temp.to_dict()
                aport = ports.Port(
                    ip=getproperty(dic, 'ip'),
                    port=getproperty(dic, 'port'),
                    timesearch=getproperty(dic, 'timesearch'),
                    state=getproperty(dic, 'state'),
                    name=getproperty(dic, 'name'),
                    product=getproperty(dic, 'product'),
                    version=getproperty(dic, 'version'),
                    script=base64.b64encode(str(getproperty(dic, 'script'))),
                    detail=getproperty(dic, 'detail'),
                    head=getproperty(dic, 'head'),
                    city='',
                    hackinfo=getproperty(dic, 'hackinfo'),
                    disclosure=getproperty(dic, 'disclosure'),
                    keywords=getproperty(dic, 'keywords'),
                    webtitle=base64.b64encode(str(getproperty(dic,
                                                              'webtitle'))),
                    webkeywords=getproperty(dic, 'webkeywords'))

                # ip=getproperty(dic,'ip')
                # port=getproperty(dic,'port')
                # timesearch=getproperty(dic,'timesearch')
                # state=getproperty(dic,'state')
                # name=getproperty(dic,'name')
                # product=getproperty(dic,'product')
                # version=getproperty(dic,'version')
                # script=base64.b64encode(getproperty(dic,'script'))
                # detail=getproperty(dic,'detail')
                # head=getproperty(dic,'head')
                # city=''
                # hackinfo=getproperty(dic,'hackinfo')
                # disclosure=getproperty(dic,'disclosure')

                portarray.append(aport)

        return portarray, count, pagecount

    else:
        print('Search failed')
        return [], 0, 0
Example #13
def results():
    page = request.args

    page_number = (int(page.get('page_number'))
                   if page.get('page_number', '') != "" else 1)
    query = page.get('query', '')
    if len(query) == 0:
        return redirect('/')
    search = Search(index=index_name)

    # boost field weights
    boost_weight = [
        i + 1 for i in get_classifier().predict([extract_features(query)])[0]
    ]
    fields_list = query_helper.boost_fields(boost_weight)

    # supports '|', '+', '-', "" phrase search, '*', etc.
    s = search.query('simple_query_string',
                     fields=fields_list,
                     query=query,
                     default_operator='and')

    # highlight
    query_helper.highlight(s, fields_list)

    start = 0 + (page_number - 1) * 10
    end = 10 + (page_number - 1) * 10
    message = []
    response = s[start:end].execute()

    # if there are no results, switch to disjunction search
    if response.hits.total == 0 and len(query) > 0:
        message.append(f'Unknown search term: {query}, switching to disjunction.')
        s = search.query('simple_query_string',
                         fields=fields_list,
                         query=query,
                         default_operator='or')
        response = s[start:end].execute()

    # insert data into response
    result_list = query_helper.parse_result(response)
    # if there are results, insert it to query_index
    query_id = 0
    qs = ""
    try:
        qs = Query.select().where(Query.query == query)
    except Query.DoesNotExist:
        qs = None
    if result_list:
        if not qs:
            q1 = Query(query=query, result=json.dumps(result_list))
            q1.save()
            query_id = q1.id
            q = SearchQuery(query=query,
                            suggest=query,
                            meta={
                                'index': 'query_index',
                                'id': query_id
                            })
            q.save()
        else:
            query_id = Query.get(Query.query == query).id

    result_num = response.hits.total
    return render_result({
        'result_list': result_list,
        'result_num': result_num,
        'query_id': query_id,
        'query': query,
        'page_number': page_number,
        'message': message,
        'page_size': 10
    })
Example #14
def search(search_params, index, page_size, ip, page=1) -> Response:
    """
    Given a set of keywords and an optional set of filters, perform a ranked
    paginated search.

    :param search_params: Search parameters. See
     :class: `~cccatalog.api.search_serializers.SearchQueryStringSerializer`.
    :param index: The Elasticsearch index to search (e.g. 'image')
    :param page_size: The number of results to return per page.
    :param page: The results page number.
    :param ip: The user's hashed IP. Hashed IPs are used to anonymously but
    uniquely identify users exclusively for ensuring query consistency across
    Elasticsearch shards.
    :return: An Elasticsearch Response object.
    """
    s = Search(index=index)

    # Paginate search query.
    start_slice = page_size * (page - 1)
    end_slice = page_size * page
    if start_slice + end_slice > ELASTICSEARCH_MAX_RESULT_WINDOW:
        raise ValueError("Deep pagination is not allowed.")
    s = s[start_slice:end_slice]

    # If any filters are specified, add them to the query.
    if 'li' in search_params.data or 'lt' in search_params.data:
        license_field = 'li' if 'li' in search_params.data else 'lt'
        license_filters = []
        for _license in search_params.data[license_field].split(','):
            license_filters.append(Q('term', license__keyword=_license))
        s = s.filter('bool', should=license_filters, minimum_should_match=1)
    if 'provider' in search_params.data:
        provider_filters = []
        for provider in search_params.data['provider'].split(','):
            provider_filters.append(Q('term', provider=provider))
        s = s.filter('bool', should=provider_filters, minimum_should_match=1)

    # It is sometimes desirable to hide content providers from the catalog
    # without scrubbing them from the database or reindexing.
    filter_cache_key = 'filtered_providers'
    filtered_providers = cache.get(key=filter_cache_key)
    if not filtered_providers:
        filtered_providers = ContentProvider.objects\
            .filter(filter_content=True)\
            .values('provider_identifier')
        cache.set(key=filter_cache_key,
                  timeout=CACHE_TIMEOUT,
                  value=filtered_providers)
    for filtered in filtered_providers:
        s = s.exclude('match', provider=filtered['provider_identifier'])

    # Search either by generic multimatch or by "advanced search" with
    # individual field-level queries specified.
    if 'q' in search_params.data:
        keywords = ' '.join(search_params.data['q'].lower().split(','))
        s = s.query('constant_score',
                    filter=Q('multi_match',
                             query=keywords,
                             fields=['tags.name', 'title'],
                             operator='AND'))
    else:
        if 'creator' in search_params.data:
            creator = search_params.data['creator']
            s = s.query('constant_score', filter=Q('match', creator=creator))
        if 'title' in search_params.data:
            title = search_params.data['title']
            s = s.query('constant_score', filter=Q('match', title=title))
        if 'tags' in search_params.data:
            tags = ' '.join(search_params.data['tags'].lower().split(','))
            s = s.query('constant_score',
                        filter=Q('multi_match',
                                 fields=['tags.name'],
                                 query=tags))

    s = s.extra(track_scores=True)
    s = s.params(preference=str(ip))
    search_response = s.execute()
    return search_response
Example #15
    def aggregate_by_event_data(self,
                                event_id=None,
                                event_data_name="Image",
                                sub_event_data_name=None,
                                bucket_size=1000,
                                sub_bucket_size=100,
                                threshold=None,
                                filter_event_data_name='',
                                filter_event_data_value='',
                                aggregate_by_hostname=False):
        es_query = self.get_default_query()

        if event_id is not None:
            es_query.append({'match': {'winlog.event_id': event_id}})

        if filter_event_data_name:
            filter_field_name = 'winlog.event_data.' + filter_event_data_name
            es_query.append(
                {'match': {
                    filter_field_name: filter_event_data_value
                }})

        query = Q({'bool': {'must': es_query}})

        s = Search(using=self.Client, index="winlogbeat-*").query(query)
        if self.DTRange is not None:
            s = s.filter('range', **self.DTRange)

        s = s.source(includes=['winlog.*'])

        if aggregate_by_hostname:
            b = s.aggs.bucket(event_data_name,
                              'terms',
                              field='agent.hostname',
                              size=bucket_size)
        else:
            b = s.aggs

        b = b.bucket(event_data_name,
                     'terms',
                     field='winlog.event_data.' + event_data_name,
                     size=bucket_size)
        if threshold:
            # https://github.com/ongr-io/ElasticsearchDSL/blob/master/docs/Aggregation/Pipeline/BucketSelector.md
            # https://elasticsearch-dsl.readthedocs.io/en/latest/search_dsl.html
            threshold_bucket_name = event_data_name + "_counts"
            b.bucket(threshold_bucket_name, 'cardinality', field='@timestamp')
            b.pipeline('threshold_bucket_selector',
                       'bucket_selector',
                       buckets_path={"counts": threshold_bucket_name},
                       script='params.counts > %d' % threshold)

        if sub_event_data_name:
            b.bucket(sub_event_data_name,
                     'terms',
                     field='winlog.event_data.' + sub_event_data_name,
                     size=sub_bucket_size)

        if self.DebugQuery:
            pprint.pprint(s.to_dict())

        if self.Scan:
            # scan() streams individual hits; aggregation buckets still require execute()
            s.scan()

        response = s.execute()

        return response.aggregations[event_data_name]
Example #16
    def get_topics_from_topic_index(self):
        s = Search(using=self.client, index=self.topic_index)

        response = s.scan()
        for hit in response:
            yield hit
Example #17
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

from core.search.cache_data import cache_paper_info

# Constants
NUM_PAPERS = 1

# Elastic search client
client = Elasticsearch(conf.get("elasticsearch.hostname"))

THRESHOLD_DATE = datetime(2019, 3, 6, 10, 43, 45, 734484) 

# Memory for deleting entries which no longer exist
last_papers = set()

while True:
    # Specify the query
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.sort({ "CreatedDate": { "order": "desc" } })
    paper_info_s = paper_info_s.update_from_dict({
        "query": {
            "bool": {
                "must_not": [{"exists": {"field": "FieldsOfStudy"}}],
                "must": {"range": {"CreatedDate": {"lt": THRESHOLD_DATE}}}
            }
        }
    })
    paper_info_s = paper_info_s.source(['PaperId'])

    # Get number of query results
    results = paper_info_s[:NUM_PAPERS]
    papers = [x.PaperId for x in results.execute()]

    # Check if the paper has been seen before, and thus needs to be deleted
    checked_papers = last_papers.intersection(set(papers))
    if checked_papers:
        delete_info_s = Search(index='paper_info', using=client)
        delete_info_s = delete_info_s.query("match", PaperId=list(checked_papers))
        delete_info_s.delete()
    last_papers = set(papers).difference(checked_papers)
Example #18
    def aggregate(self, keyword):
        s = Search(using=self.client,
                   index=self.news_index).query("match", content=keyword)
        response = s.execute()
        for tag in response.aggregations:
            print(tag)
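
Note that the method above iterates response.aggregations without ever declaring an aggregation on the Search, so there is nothing to print. A hedged sketch of the usual pattern, with placeholder names for the client, index and aggregated field:

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

client = Elasticsearch()                              # placeholder connection
news_index, keyword = "news", "elasticsearch"         # placeholder index / term

s = Search(using=client, index=news_index).query("match", content=keyword)
s.aggs.bucket("by_tag", "terms", field="tags", size=20)   # "tags" field is assumed
response = s.execute()
for bucket in response.aggregations.by_tag.buckets:
    print(bucket.key, bucket.doc_count)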
Example #19
def search_elastic(term='',
                   user=None,
                   sort='id',
                   order='desc',
                   category='0_0',
                   quality_filter='0',
                   page=1,
                   rss=False,
                   admin=False,
                   logged_in_user=None,
                   per_page=75,
                   max_search_results=1000):
    # This function can easily be memcached now

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'comments': 'comment_count',
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort_]  # use the lower-cased key that was validated above

    order_keys = {'desc': 'desc', 'asc': 'asc'}

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']  # force sorting by id for RSS
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'  # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(
                    main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client,
               index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query(
            'simple_query_string',
            # Query both fields, latter for words with >15 chars
            fields=['display_name', 'display_name.fullword'],
            analyzer='my_search_analyzer',
            default_operator="AND",
            query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page,
                       int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()
Example #20
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
import pandas as pd

# We will create an object for accessing the ElasticSearch instance.
# In this case, we will assume that it is running on our system with its REST interface available in port 9200
# verify_certs will be necessary if you're connecting to an ElasticSearch over TLS (https) with a bad certificate
es = Elasticsearch('http://localhost:9200', verify_certs=False)

# The enriched indexes store one document per commit
# The query builds buckets of commits, grouped by author name,
# aggregated as first commit for each of these authors
s = Search(using=es, index='git')
s.aggs.bucket('by_authors', 'terms', field='author_name',
              size=10000).metric('first_commit', 'min', field='author_date')
s = s.sort('author_date')
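
The snippet above only builds the request. A hedged continuation that executes it and loads the author buckets into pandas (it assumes a populated 'git' index as described in the comments):

response = s.execute()
buckets = response.aggregations.by_authors.buckets
authors = pd.DataFrame(
    [{"author": b.key, "commits": b.doc_count, "first_commit": b.first_commit.value}
     for b in buckets])
print(authors.sort_values("first_commit").head())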
Example #21
def search(event_name):
    s = Search().filter('term', event_name=event_name)
    response = s.execute()
    return response
Example #22
def default_search(param):
    if param is None:
        return Search()
    return param
Example #23
def detail(request, id):
    '''
    Detail view for an IP or domain.
    :param request:
    :param id:
    :return:
    '''
    data = es_search_ip_by_id(id)
    if not data:
        raise Http404
    data = data[0]
    doc_type = data["_type"]
    data = data["_source"]
    data["published_from"] = datetime_string_format(data["published_from"])
    if doc_type == "ips":
        target = data["target"]
        data["proper"] = is_proper(target, "ip")
        # Find associated domains
        union_domains = es_search_domain_by_ip(target, True)
        # Historical IP records
        historys = es_search_ip(target)
        for h in historys:
            h["published_from"] = datetime_string_format(h["published_from"])

        # Related IPs in the same /24 (class C) range
        c_data = []
        temp_ips = target.split(".")
        if len(temp_ips) == 4:
            del temp_ips[-1]
            query_ip = '.'.join(temp_ips) + ".*"
            payload = {
                "query": {
                    "wildcard": {"target": query_ip}
                },
                "collapse": {
                    "field": "target"
                },
                "sort": {
                    "published_from": {"order": "desc"}
                },
                "from": 0,
                "size": 10000
            }

            s = Search(using=es, index='w12scan', doc_type='ips').from_dict(payload)
            res = s.execute()
            for hit in res:
                cid = hit.meta.id
                d = hit.to_dict()
                if d["target"] != target:
                    if isinstance(d["target"], list):
                        d["target"] = d["target"][0]
                    # Domains hosted on this class-C IP
                    sub_data = []
                    sub_domain = es_search_domain_by_ip(d["target"], True)
                    for sub in sub_domain:
                        dd = {}
                        dd.update(sub)
                        sub_data.append(dd)
                    extrainfo = ""
                    for k in d.get("infos", []):
                        extrainfo += "{0}/{1} ".format(k.get("port", ""), k.get("name", "unknown"))

                    c_data.append({"id": cid, "ip": d["target"], "data": sub_data, "extrainfo": extrainfo})

            # Sort c_data by the last octet of each IP

            c_data.sort(key=lambda a: int(a.get("ip", 0).split(".")[3]))

        return render(request, "frontend/ip_detail.html",
                      {"data": data, "union": union_domains, "c_data": c_data, "third_infomation": third_info(target),
                       "historys": historys})
    elif doc_type == "domains":
        ip = data["ip"]
        target = data["url"]
        data["proper"] = is_proper(target, "domain")

        # Fields to display in the UI
        field = ["title", "status_code", "X-Powered-By", "Server"]
        uldata = []
        for f in field:
            if f in data:
                uldata.append((f, data[f]))
        hit = es_search_ip(ip, deduplicat=True)

        historys = es_search_domain_by_url(target)
        for h in historys:
            h["published_from"] = datetime_string_format(h["published_from"])

        # s = Search(using=es, index='w12scan', doc_type='ips').from_dict(payload)
        ip_data = {}
        if hit:
            ip_data["id"] = hit.meta.id
            ip_data["ip"] = list(hit.target)[0]

        # Get the registered domain (for subdomain lookup)
        try:
            sub_domain = get_fld(target, fix_protocol=True)
        except Exception:
            sub_domain = None
        sub_domain_data = []
        if sub_domain:
            payload = {"query": {
                "wildcard": {"url": "*." + sub_domain}
            }
                , "collapse": {
                    "field": "url"
                },
                "sort": {
                    "published_from": {"order": "desc"}
                },
                "from": 0,
                "size": 10000
            }
            s = Search(using=es, index='w12scan', doc_type='domains').from_dict(payload)
            for hit in s:
                dd = {}
                dd.update(hit.to_dict())
                if isinstance(dd["url"], list):
                    dd["url"] = dd["url"][0]
                dd["id"] = hit.meta.id
                dd["published_from"] = datetime_string_format(dd["published_from"])
                sub_domain_data.append(dd)

        return render(request, "frontend/domain_detail.html",
                      {"data": data, "ip_data": ip_data, "sub_domain": sub_domain_data,
                       "third_infomation": third_info(ip), "historys": historys, "uldata": uldata})
Example #24
    def get_queryset(self):
        if not self.index_manager.connected_to_es:
            messages.warning(self.request,
                             _('Impossible de se connecter à Elasticsearch'))
            return []

        if self.search_query:

            # Searches forums the user is allowed to visit
            self.authorized_forums = get_authorized_forums(self.request.user)

            search_queryset = Search()

            # Restrict (sub)category if any
            if self.search_form.cleaned_data['category']:
                self.content_category = self.search_form.cleaned_data[
                    'category']
            if self.search_form.cleaned_data['subcategory']:
                self.content_subcategory = self.search_form.cleaned_data[
                    'subcategory']

            # Mark that contents must come from library if required
            self.from_library = False
            if self.search_form.cleaned_data['from_library'] == 'on':
                self.from_library = True

            # Setting the different querysets (according to the selected models, if any)
            part_querysets = []
            chosen_groups = self.search_form.cleaned_data['models']

            if chosen_groups:
                models = []
                for group in chosen_groups:
                    if group in settings.ZDS_APP['search']['search_groups']:
                        models.append(settings.ZDS_APP['search']
                                      ['search_groups'][group][1])
            else:
                models = [
                    v[1] for k, v in settings.ZDS_APP['search']
                    ['search_groups'].items()
                ]

            models = reduce(operator.concat, models)

            for model in models:
                part_querysets.append(
                    getattr(self, 'get_queryset_{}s'.format(model))())

            queryset = part_querysets[0]
            for query in part_querysets[1:]:
                queryset |= query

            # Weighting:
            weight_functions = []
            for _type, weights in list(
                    settings.ZDS_APP['search']['boosts'].items()):
                if _type in models:
                    weight_functions.append({
                        'filter': Match(_type=_type),
                        'weight': weights['global']
                    })

            scored_queryset = FunctionScore(query=queryset,
                                            boost_mode='multiply',
                                            functions=weight_functions)
            search_queryset = search_queryset.query(scored_queryset)

            # Highlighting:
            search_queryset = search_queryset.highlight_options(
                fragment_size=150,
                number_of_fragments=5,
                pre_tags=['[hl]'],
                post_tags=['[/hl]'])
            search_queryset = search_queryset.highlight('text').highlight(
                'text_html')

            # Executing:
            return self.index_manager.setup_search(search_queryset)

        return []
Example #25
def zc_detail(request, id):
    try:
        m = properly.objects.get(id=id)
    except properly.DoesNotExist:
        m = None
    if m is None:
        raise Http404
    # Handle domain assets
    show_data = {}
    domains = m.domains.splitlines()
    show_data['domains'] = domains

    payload = {"query": {
        "bool": {
            "should": [

            ]
        }
    }, "collapse": {
        "field": "url"
    },
        "sort": {
            "published_from": {"order": "desc"}
        },
        "from": 0,
        "size": 10000
    }
    temp_list = []
    for temp in domains:
        if "*" not in temp and not temp.startswith("http"):
            temp = "http*" + temp
        temp_list.append({
            "wildcard": {
                "url": temp
            }
        })
    payload["query"]["bool"]["should"] = temp_list
    domains_data = []
    apps = set()
    if temp_list:
        s = Search(using=es, index='w12scan', doc_type='domains').from_dict(payload)
        for hit in s:
            dd = {}
            dd.update(hit.to_dict())
            dd["id"] = hit.meta.id
            if isinstance(dd["url"], list):
                dd["url"] = dd["url"][0]
            if dd.get("app"):
                apps |= set(dd.get("app"))
            domains_data.append(dd)
    # Extract IPs from the domains and add them to the IP assets
    temp_ips = set()
    for domain in domains_data:
        ip = domain.get("ip")
        if ip:
            temp_ips.add(ip)
    # Handle IP assets
    ips = m.ips.splitlines()
    show_data["ips"] = ips
    temp_ips |= set(ips)
    temp_list = []
    for temp in temp_ips:
        _ip = temp
        if "*" in _ip:
            temp_list.append({
                "wildcard": {
                    "target": _ip
                }
            })
        elif "/" in _ip:
            try:
                net = ipaddress.ip_network(_ip)
            except Exception as e:
                print(e)
                net = None
            if net:
                for i in net:
                    if str(i) not in temp_ips:  # net yields IPv4Address objects
                        temp_list.append({
                            "term": {
                                "target": str(i)
                            }
                        })
        else:
            temp_list.append({
                "term": {
                    "target": _ip
                }
            })

    payload = {"query": {
        "bool": {
            "should": [

            ]
        }
    }, "collapse": {
        "field": "target"
    },
        "sort": {
            "published_from": {"order": "desc"}
        },
        "from": 0,
        "size": 10000
    }
    payload["query"]["bool"]["should"] = temp_list
    ips_data = []
    # IP service name statistics
    statics_services = {}

    if temp_list:
        s = Search(using=es, index='w12scan', doc_type='ips').from_dict(payload)
        for hit in s:
            dd = {}
            dd.update(hit.to_dict())
            dd["id"] = hit.meta.id
            if isinstance(dd["target"], list):
                dd["target"] = dd["target"][0]
            ips_data.append(dd)
            # statistics
            if dd.get("infos"):
                for item in dd.get("infos"):
                    name = item.get("name", None)
                    if not name:
                        continue
                    if name not in statics_services:
                        statics_services[name] = 0
                    statics_services[name] += 1

    data_pie = {
        "labels": list(statics_services.keys()),
        "data": list(statics_services.values())
    }

    return render(request, "frontend/zc-detail.html",
                  {"model": m, "domains": domains_data, "show_data": show_data, "apps": apps, "ips": ips_data,
                   "data_pie": data_pie})
Example #26
0
    def search_results(self, request, query_term):
        """ Display results based on search term. """
        is_gene_suggest = False
        if request.method == "GET":
            client = Elasticsearch([ES_HOST], timeout=60)
            search_gene = Search().using(client).doc_type('genes').source(
                exclude=['isoforms.cds', 'isoforms.exons',
                         'GO'])  #'isoforms.cds','GO'])
            if query_term is None:
                studies = Study.objects.all()
                phenotypes = Phenotype.objects.all()
                # The Elasticsearch query cannot be built before the ordering and page number are known, since elasticsearch.py takes them into account.
            else:
                studies = Study.objects.filter(
                    Q(name__icontains=query_term)
                    | Q(phenotype__trait_ontology_name__icontains=query_term)
                    | Q(phenotype__name__icontains=query_term)
                    | Q(phenotype__description__icontains=query_term)
                    | Q(publication_pmid__icontains=query_term)
                    | Q(publication_pmcid__icontains=query_term)).order_by(
                        'n_hits_perm').reverse()
                phenotypes = Phenotype.objects.filter(
                    Q(name__icontains=query_term)
                    | Q(description__icontains=query_term)).order_by('name')
                # Add chromosome position search for genomic regions
                try:
                    int(query_term)
                    isnum = True
                except ValueError:
                    isnum = False
                import re
                pattern = re.compile(
                    r"(Chr|CHR|chr)+\s?([0-9]{1,2})+(-|:)?(\d*)\s*(-|:|)?\s*(\d+|)"
                )
                if isnum:  # Only a number, look for neighboring genes on all chromosomes.
                    q = QES('range',
                            positions={
                                "gte": int(query_term),
                                'lte': int(query_term)
                            })
                    search_gene = search_gene.query(q)
                elif pattern.match(query_term):  # Specific genomic range
                    splitted = re.split(
                        r"(Chr|CHR|chr)+\s?([0-9]{1,2})+(-|:)?(\d*)\s*(-|:|)?\s*(\d+|)",
                        query_term)
                    chr = int(splitted[2])
                    s_p = None
                    e_p = None
                    if splitted[4]:
                        s_p = int(splitted[4])
                    if splitted[6]:
                        e_p = int(splitted[6])
                    # Need to retrieve all genes that overlap somehow with that region (all-in, right part in, left part in, etc)
                    q = QES('match', chr='chr' + str(chr))
                    search_gene = search_gene.query(q)
                    if s_p:
                        if e_p:
                            # Look for genes overlapping with region of interest
                            q = QES('range',
                                    positions={
                                        'gte': s_p,
                                        'lte': e_p
                                    }) | QES('range',
                                             positions={
                                                 'gte': s_p,
                                                 'lte': s_p
                                             }) | QES('range',
                                                      positions={
                                                          'gte': e_p,
                                                          'lte': e_p
                                                      })
                        else:
                            q = QES('range',
                                    positions={
                                        'gte': s_p,
                                        'lte': s_p
                                    }) | QES('range', positions={'gte': s_p})
                        search_gene = search_gene.query(q)
                else:  # other type of request
                    is_gene_suggest = True
                    search_gene = search_gene.suggest('gene_suggest',
                                                      query_term,
                                                      completion={
                                                          'field': 'suggest',
                                                          'size': 200
                                                      })
            # custom ordering
            ordering = request.query_params.get('ordering', None)
            ordering_fields = {
                'studies':
                ['name', 'genotype', 'phenotype', 'method', 'transformation'],
                'phenotypes': ['name', 'description'],
                'genes': [
                    'name', 'chr', 'start', 'end', 'SNPs_count',
                    'association_count', 'description'
                ]
            }
            if ordering is not None:
                from django.db.models.functions import Lower
                inverted = False
                if ordering.startswith('-'):
                    inverted = True
                    ordering = ordering[1:]
                if ordering in ordering_fields['studies'] and studies:
                    if ordering == 'phenotype' or ordering == 'genotype':  # Need to reference the names and not the internal IDs for ordering
                        ordering += '__name'
                    studies = studies.order_by(Lower(ordering)).reverse()
                    if inverted:
                        studies = studies.reverse()
                if ordering in ordering_fields['phenotypes'] and phenotypes:
                    phenotypes = phenotypes.order_by(Lower(ordering))
                    if inverted:
                        phenotypes = phenotypes.reverse()
                if ordering in ordering_fields['genes']:
                    # if ordering == 'snp' or ordering == 'study':
                    #     ordering += '__name'
                    # genes = genes.order_by(Lower(ordering))
                    if ordering == 'start' or ordering == 'end':
                        ordering += '_position'
                    if inverted:
                        ordering = "-" + ordering
                    search_gene = search_gene.sort(ordering)

            n_genes = search_gene.count()
            if studies:
                pagest = self.paginate_queryset(studies)
                study_serializer = StudySerializer(pagest, many=True)
            else:
                study_serializer = StudySerializer(studies, many=True)

            if n_genes:
                size = min(200, search_gene.count())
                if is_gene_suggest:
                    size = 0
                results = search_gene[0:size].execute()
                if is_gene_suggest:
                    genes = results.to_dict(
                    )['suggest']['gene_suggest'][0]['options']
                else:
                    genes = results.to_dict()['hits']['hits']
                genes_out = []
                for gene in genes:
                    genes_out.append(gene["_source"])
                pagege = self.paginate_queryset(genes_out)
            else:
                genes = []
                pagege = []

            if phenotypes:
                pagephe = self.paginate_queryset(phenotypes)
                phenotype_serializer = PhenotypeListSerializer(pagephe,
                                                               many=True)
            else:
                phenotype_serializer = PhenotypeListSerializer(phenotypes,
                                                               many=True)

            counts = [len(genes), len(phenotypes), len(studies)]
            PAGE_SIZE = 25.
            import math
            page_counts = [
                int(math.ceil(float(len(genes)) / PAGE_SIZE)),
                int(math.ceil(float(len(phenotypes)) / PAGE_SIZE)),
                int(math.ceil(float(len(studies)) / PAGE_SIZE))
            ]
            data = {
                'study_search_results': study_serializer.data,
                'phenotype_search_results': phenotype_serializer.data,
                'gene_search_results': pagege,
                'counts': counts,
                'page_counts': page_counts
            }

            if any([studies, genes, phenotypes]):
                return self.get_paginated_response(data)
            else:
                return Response({
                    'results': {i: data[i]
                                for i in data if i != 'counts'},
                    'count': counts,
                    'page_count': [0, 0, 0]
                })
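The genomic-interval branch above combines three range queries with "|" so that genes fully inside the interval and genes spanning either endpoint all match. A minimal standalone sketch of that idea, assuming QES is the elasticsearch_dsl Q shortcut as the example suggests; the positions/chr field names follow the example, while the client and index name are placeholders.

from elasticsearch_dsl import Search, Q as QES


def genes_overlapping(client, chrom, start, end, index="genes"):
    """Find genes whose 'positions' values overlap the interval [start, end] on one chromosome."""
    overlap = (
        QES('range', positions={'gte': start, 'lte': end})      # a position falls inside the interval
        | QES('range', positions={'gte': start, 'lte': start})  # the gene spans the left edge
        | QES('range', positions={'gte': end, 'lte': end})      # the gene spans the right edge
    )
    s = Search(using=client, index=index).query(QES('match', chr='chr' + str(chrom)))
    return s.query(overlap)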
Example #27
0
    def _Search(self, indexname):
        """
        Return a Search object that can be used for retrieving certain values from the DB.
        """
        return Search(using=self.__client, index=indexname)
Example #28
0
def analyze_git(es_read, es_write, es_read_index, es_write_index, git_enrich,
                size, incremental):

    query = {"match_all": {}}
    sort = [{"metadata__timestamp": {"order": "asc"}}]

    if incremental.lower() == 'true':
        search = Search(using=es_write, index=es_write_index)
        # from/size parameters (from=0, size=0: aggregations only, no hits)
        search = search[0:0]
        search = search.aggs.metric('max_date',
                                    'max',
                                    field='metadata__timestamp')

        try:
            response = search.execute()

            if response.to_dict()['aggregations']['max_date']['value'] is None:
                msg = "No data for 'metadata__timestamp' field found in "
                msg += es_write_index + " index"
                logging.warning(msg)
                init_write_index(es_write, es_write_index)

            else:
                # Incremental case: retrieve items from last item in ES write index
                max_date = response.to_dict(
                )['aggregations']['max_date']['value_as_string']
                max_date = date_parser.parse(max_date).isoformat()

                logging.info("Starting retrieval from: " + max_date)
                query = {"range": {"metadata__timestamp": {"gte": max_date}}}

        except NotFoundError:
            logging.warning("Index not found: " + es_write_index)
            init_write_index(es_write, es_write_index)

    else:
        init_write_index(es_write, es_write_index)

    search_query = {"query": query, "sort": sort}

    logging.info(search_query)

    logging.info("Start reading items...")

    commits = []
    cont = 0

    for hit in helpers.scan(es_read,
                            search_query,
                            scroll='300m',
                            index=es_read_index,
                            preserve_order=True):

        cont = cont + 1

        item = hit["_source"]
        commits.append(item)
        logging.debug("[Hit] metadata__timestamp: " +
                      item['metadata__timestamp'])

        if cont % size == 0:
            logging.info("Total Items read: " + str(cont))

            events_df = eventize_and_enrich(commits, git_enrich)
            upload_data(events_df, es_write_index, es_write)

            commits = []
            events_df = None

    # In case we have some commits pending, process them
    if len(commits) > 0:
        logging.info("Total Items read: " + str(cont))
        events_df = eventize_and_enrich(commits, git_enrich)
        upload_data(events_df, es_write_index, es_write)
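The incremental branch above relies on a common pattern: run an aggregation-only query (size 0) to find the newest metadata__timestamp already written, then resume reading from that date. A minimal sketch of just that step, assuming a recent elasticsearch_dsl where aggregations are added to the Search in place; the helper name is illustrative.

from elasticsearch_dsl import Search


def last_indexed_timestamp(client, index):
    """Return the newest metadata__timestamp in the index, or None if the index holds no data."""
    search = Search(using=client, index=index)[0:0]  # size 0: aggregations only, no hits
    search.aggs.metric('max_date', 'max', field='metadata__timestamp')
    response = search.execute()
    max_date = response.aggregations.max_date
    if max_date.value is None:
        return None
    return max_date.value_as_string  # ISO date string suitable for a range filter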
Example #29
0
File: lab4.py Project: qc-an/CAI
if __name__ == '__main__':
    # Selection of the arguments.
    index, nrounds, R, k, alpha, beta, query = parse_and_select()

    # If the user introduced the argument (--query) but not the terms to search.
    if not query:
        print('No query parameters passed!')
        raise SystemExit
    else:
        query_dict = ini_query_dic(query)

    try:
        # Start the Elasticsearch client
        client = Elasticsearch()
        s = Search(using=client, index=index)

        # The query is run and, using the k most relevant documents retrieved,
        # it is updated with Rocchio's rule for nrounds iterations.
        for _ in range(nrounds):
            #Creation of the query.
            q = Q('query_string', query=query[0])
            for i in range(1, len(query)):
                q &= Q('query_string', query=query[i])

            s = s.query(q)
            # We select the k most relevant documents.
            response = s[0:k].execute()

            # We compute the TFIDF representation for each document and
            #  store them in a list
Example #30
0
def _find(params,
          total_only=False,
          make_suggestions=False,
          min_suggestion_score=0.8):
    search_query = Search(index=settings.SEARCH_INDEX_NAME)
    if make_suggestions:
        # XXX research whether it's better to use phrase suggesters and if
        # that works
        # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters.html#phrase-suggester
        search_query = search_query.suggest("title_suggestions",
                                            params["query"],
                                            term={"field": "title"})
        search_query = search_query.suggest("body_suggestions",
                                            params["query"],
                                            term={"field": "body"})

    sub_queries = []
    sub_queries.append(
        Q("match", title={
            "query": params["query"],
            "boost": 2.0
        }))
    sub_queries.append(
        Q("match", body={
            "query": params["query"],
            "boost": 1.0
        }))
    if " " in params["query"]:
        sub_queries.append(
            Q("match_phrase", title={
                "query": params["query"],
                "boost": 10.0
            }))
        sub_queries.append(
            Q("match_phrase", body={
                "query": params["query"],
                "boost": 5.0
            }))

    sub_query = query.Bool(should=sub_queries)

    if params["locales"]:
        search_query = search_query.filter("terms", locale=params["locales"])
    if params["archive"] == "exclude":
        search_query = search_query.filter("term", archived=False)
    elif params["archive"] == "only":
        search_query = search_query.filter("term", archived=True)

    search_query = search_query.highlight_options(
        pre_tags=["<mark>"],
        post_tags=["</mark>"],
        number_of_fragments=3,
        fragment_size=120,
        encoder="html",
    )
    search_query = search_query.highlight("title", "body")

    if params["sort"] == "relevance":
        search_query = search_query.sort("_score", "-popularity")
        search_query = search_query.query(sub_query)
    elif params["sort"] == "popularity":
        search_query = search_query.sort("-popularity", "_score")
        search_query = search_query.query(sub_query)
    else:
        popularity_factor = 10.0
        boost_mode = "sum"
        score_mode = "max"
        search_query = search_query.query(
            "function_score",
            query=sub_query,
            functions=[
                query.SF(
                    "field_value_factor",
                    field="popularity",
                    factor=popularity_factor,
                    missing=0.0,
                )
            ],
            boost_mode=boost_mode,
            score_mode=score_mode,
        )

    search_query = search_query.source(excludes=["body"])

    search_query = search_query[params["size"] *
                                (params["page"] - 1):params["size"] *
                                params["page"]]

    retry_options = {
        "retry_exceptions": (
            # This is the standard operational exception.
            exceptions.ConnectionError,
            # This can happen if the search happened right as the index had
            # just been deleted due to a fresh re-indexing happening in Yari.
            exceptions.NotFoundError,
            # This can happen when the index simply isn't ready yet.
            exceptions.TransportError,
        ),
        # The default in redo is 60 seconds. Let's tone that down.
        "sleeptime":
        settings.ES_RETRY_SLEEPTIME,
        "attempts":
        settings.ES_RETRY_ATTEMPTS,
        "jitter":
        settings.ES_RETRY_JITTER,
    }
    with retrying(search_query.execute, **retry_options) as retrying_function:
        response = retrying_function()

    if total_only:
        return response.hits.total

    metadata = {
        "took_ms": response.took,
        "total": {
            # The `response.hits.total` is a `elasticsearch_dsl.utils.AttrDict`
            # instance. Pluck only the exact data needed.
            "value": response.hits.total.value,
            "relation": response.hits.total.relation,
        },
        "size": params["size"],
        "page": params["page"],
    }
    documents = []
    for hit in response:
        try:
            body_highlight = list(hit.meta.highlight.body)
        except AttributeError:
            body_highlight = []
        try:
            title_highlight = list(hit.meta.highlight.title)
        except AttributeError:
            title_highlight = []

        d = {
            "mdn_url": hit.meta.id,
            "score": hit.meta.score,
            "title": hit.title,
            "locale": hit.locale,
            "slug": hit.slug,
            "popularity": hit.popularity,
            "archived": hit.archived,
            "summary": hit.summary,
            "highlight": {
                "body": body_highlight,
                "title": title_highlight,
            },
        }
        documents.append(d)

    try:
        suggest = getattr(response, "suggest")
    except AttributeError:
        suggest = None

    suggestions = []
    if suggest:
        suggestion_strings = _unpack_suggestions(
            params["query"],
            response.suggest,
            ("body_suggestions", "title_suggestions"),
        )

        for score, string in suggestion_strings:
            if score > min_suggestion_score or 1:
                # Sure, this is a different way to spell it, but what would it
                # yield if you actually searched for it?
                total = _find(params, total_only=True)
                if total["value"] > 0:
                    suggestions.append({
                        "text": string,
                        "total": {
                            # This 'total' is an `AttrDict` instance.
                            "value": total.value,
                            "relation": total.relation,
                        },
                    })
                    # Since they're sorted by score, it's rarely useful to
                    # offer more than exactly one good suggestion.
                    break

    return {
        "documents": documents,
        "metadata": metadata,
        "suggestions": suggestions,
    }
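The default-sort branch above wraps the text query in a function_score so that a numeric popularity field lifts the relevance score. A minimal standalone sketch of that boost, with a placeholder index name; the field names, factor and boost_mode values mirror the example.

from elasticsearch_dsl import Q, Search, query


def popularity_boosted_search(text, index="documents"):
    """Full-text query whose score is raised by a numeric 'popularity' field."""
    text_query = query.Bool(should=[
        Q("match", title={"query": text, "boost": 2.0}),
        Q("match", body={"query": text, "boost": 1.0}),
    ])
    return Search(index=index).query(
        "function_score",
        query=text_query,
        functions=[
            query.SF("field_value_factor", field="popularity", factor=10.0, missing=0.0),
        ],
        boost_mode="sum",    # add the popularity factor to the text score
        score_mode="max",
    )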