def _list_columns(self, experiment_id: str, stages: List[LifecycleStage],
                  column_type: str, columns: List[str], size: int = 100) -> None:
    s = Search(index="mlflow-runs").filter("match", experiment_id=experiment_id) \
        .filter("terms", lifecycle_stage=stages)
    s.aggs.bucket(column_type, 'nested', path=column_type) \
        .bucket(f'{column_type}_keys', "composite", size=size,
                sources=[{"key": {"terms": {"field": f'{column_type}.key'}}}])
    response = s.params(size=0).execute()
    new_columns = [
        column.key.key for column in
        attrgetter(f'aggregations.{column_type}.{column_type}_keys.buckets')(response)
    ]
    columns += new_columns
    # Keep paging through the composite aggregation until a page comes back
    # with fewer than `size` keys.
    while len(new_columns) == size:
        last_col = attrgetter(
            f'aggregations.{column_type}.{column_type}_keys.after_key.key')(response)
        s = Search(index="mlflow-runs").filter("match", experiment_id=experiment_id) \
            .filter("terms", lifecycle_stage=stages)
        s.aggs.bucket(column_type, 'nested', path=column_type) \
            .bucket(f'{column_type}_keys', "composite", size=size,
                    sources=[{"key": {"terms": {"field": f'{column_type}.key'}}}],
                    after={"key": last_col})
        response = s.params(size=0).execute()
        new_columns = [
            column.key.key for column in
            attrgetter(f'aggregations.{column_type}.{column_type}_keys.buckets')(response)
        ]
        columns += new_columns
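# Hedged usage sketch (not from the original source): assuming `_list_columns` is a method of a
# tracking-store object with an Elasticsearch connection already configured, the composite
# aggregation above can page through all metric/param/tag keys of one experiment. The store
# argument and the lifecycle-stage values below are illustrative assumptions.
def _example_collect_columns(store, experiment_id: str) -> dict:
    stages = [LifecycleStage.ACTIVE, LifecycleStage.DELETED]  # assumed enum values
    columns = {"metrics": [], "params": [], "tags": []}
    for column_type, collected in columns.items():
        # Each call appends every distinct `<column_type>.key` found in the index,
        # fetching `size` keys per composite-aggregation page.
        store._list_columns(experiment_id, stages, column_type, collected, size=100)
    return columns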
def doctor(doc):
    for doc_type in doc:
        start, limit, i, err = 0, 500, 0, 0
        while True:
            try:
                s = Search(using=client, index="doctor-%s" % doc_type).sort()
                s = s[start:start + limit]
                # `params()` returns a clone, so the result must be assigned back.
                s = s.params(scroll="1024m")
                res = s.execute()
                if s.count() == 0:
                    break
                for hit in res:
                    i += 1
                    print("%s--%s--%s" % (doc_type, i, hit.name))
                    d = Doctor.nodes.get_or_none(did=hit.document_id)
                    if d:
                        continue
                    data = hit.to_dict()
                    goodat = data.get('goodat', None)
                    description = data.get('description', None)
                    sex = data.get('sex', None)
                    d = Doctor(
                        did=data['document_id'],
                        name=data['name'],
                        goodat="".join(goodat.split()) if goodat else None,
                        sex=''.join(sex.split()) if sex and isinstance(sex, str) else None,
                        description=''.join(description.split())
                        if description and isinstance(description, str) else None,
                        title=data.get('title', None),
                        sourceUrl=data.get('source_url', None),
                        sourceType=data.get('document_type'),
                        headerUrl=data.get('headerUrl', None),
                    ).save()
                    hs = data.get('hospitals', [])
                    deps = []
                    for h in hs:
                        deps.extend(h['departments'])
                        hos = Hospital.nodes.get_or_none(hid=h['hospital_id'])
                        if hos:
                            d.hospital.connect(hos, {
                                'department': ','.join(h['departments'])
                            })
                    if deps:
                        department = getDepartment(deps[0])
                        if department:
                            d.department.connect(department)
                    province = getProvince(data.get('province', None))
                    city = getCity(data.get('city', None), province)
                    if province:
                        d.province.connect(province)
                    if city:
                        d.city.connect(city)
                    d.save()
                    del data
                start += limit
            except Exception as e:
                print(e)
                err += 1
                if err > 10:
                    break
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries the fatcat search index (the full regular fatcat.wiki release index)
    for the search string passed (plus some filters), iterates over the result
    set (using scroll), and fetches the full release entity (via api.fatcat.wiki)
    for each.

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)
    search = search.exclude("terms", release_type=["stub", "component", "abstract"])
    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])
    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])
    if fulltext_only:
        search = search.filter("terms", in_ia=True)
    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )

    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
def pr_links_query(paper_ids):
    ''' Get reference, citation, and field-of-study links for papers.
    '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    pr_targets = ['PaperId', 'PaperReferenceId', 'FieldOfStudyId']

    # Query results
    references = list()
    citations = list()
    fieldsofstudy = list()

    # Result dictionary
    results = dict()
    for paper_id in paper_ids:
        results[paper_id] = {
            'References': [],
            'Citations': [],
            'FieldsOfStudy': []
        }

    # Query for paper references
    ref_s = Search(index='paperreferences', using=client)
    ref_s = ref_s.query('terms', PaperId=paper_ids)
    ref_s = ref_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for ref_info in ref_s.scan():
        results[ref_info[pr_targets[0]]]['References'].append(
            ref_info[pr_targets[1]])

    # Query for paper citations
    cit_s = Search(index='paperreferences', using=client)
    cit_s = cit_s.query('terms', PaperReferenceId=paper_ids)
    cit_s = cit_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for cit_info in cit_s.scan():
        results[cit_info[pr_targets[1]]]['Citations'].append(
            cit_info[pr_targets[0]])

    # Query for paper fields of study
    fos_s = Search(index='paperfieldsofstudy', using=client)
    fos_s = fos_s.query('terms', PaperId=paper_ids)
    fos_s = fos_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for fos_info in fos_s.scan():
        results[fos_info[pr_targets[0]]]['FieldsOfStudy'].append(
            fos_info[pr_targets[2]])

    # Return results as a dictionary
    return results
class AllSearchDocumentViewSet(BaseSearchDocumentViewSet):
    document = ActionDocument  # This needs to be filled with a valid Document
    serializer_class = (
        ActionSearchSerializer  # This needs to be filled with a valid Serializer
    )

    def __init__(self, *args, **kwargs):
        super(AllSearchDocumentViewSet, self).__init__(*args, **kwargs)
        self.search = Search(
            using=self.client,
            index=list(settings.ELASTICSEARCH_INDEX_NAMES.values()),
            doc_type=self.document._doc_type.name,
        ).sort(*self.ordering)
        # `params()` returns a clone, so the result must be assigned back.
        self.search = self.search.params(preserve_order=False)
def get_indices(self, docTypes: List = ["default"]) -> str:
    """
    Returns a comma-separated string of all indices for the given doc types.
    :param docTypes: List of doc types to search; if empty, all doc types are searched.
    :return: A string representing the indices to search (uses * to group multiple indices).
    """
    es = get_es_conn()
    indexNamesStr = ""
    if docTypes:
        s = Search(using=es, index=self.typeIndex,
                   doc_type="directory_type").query("ids", values=docTypes)
        s = s.params(scroll=get_scan_scroll_duration(),
                     size=get_nb_documents_per_scan_scroll())
        indexNamesQuery = s.source(["indexName"])
        indexNamesArr = []
        for indexNamePart in indexNamesQuery.scan():
            indexNamesArr.append(indexNamePart["indexName"])
        indexNamesStr = ','.join(indexNamesArr)
    else:
        indexNamesStr = self.dataIndexPrefix + "*"
    return indexNamesStr
def es_get_papers_fos(paperids):
    s = Search(using=client, index="paperfieldsofstudy")
    s = s.query("terms", PaperId=paperids)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
def count(self, timeout=30):
    s = Search(using=self.es_client, index=self.resource_index)
    s = s.filter('term', **{'resource.keyword': 'gpc'})

    # Only count completed scans.
    s = s.filter('terms', **{'status.keyword': ['ok', 'failed']})

    # Only count base domains.
    s = s.filter('term', **{'is_base_domain': True})

    # Don't need any actual results - just the count and aggregations.
    s = s[0:0]

    # Use aggregation to count subset that reports support.
    supporting_filters = [
        {'term': {'scan_data.found': True}},
        {'term': {'scan_data.gpc.parsed.gpc': True}},
    ]
    s.aggs.bucket('supporting', 'filter', bool={'filter': supporting_filters})

    s = s.extra(track_total_hits=True)
    s = s.params(request_timeout=timeout)

    response = s.execute()
    supporting_count = response.aggregations.supporting.doc_count
    return response.hits.total.value, supporting_count
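# Hedged usage sketch (illustrative, not from the original source): assuming `count()` above is a
# method of a store object wrapping the GPC scan index, its two return values can be combined into
# a simple support ratio. `store` is a hypothetical instance.
def example_support_ratio(store) -> float:
    total, supporting = store.count(timeout=30)
    # Guard against an empty index before dividing.
    return (supporting / total) if total else 0.0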
def es_get_fos_level(fosids):
    s = Search(using=client, index="fieldsofstudy")
    s = s.query("terms", FieldOfStudyId=fosids)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
def find_tweetable(self, limit=10, timeout=30):
    s = Search(using=self.es_client, index=self.resource_index)
    s = s.filter('term', **{'resource.keyword': 'gpc'})

    # Only tweet about sites where the last scan succeeded, a gpc.json was
    # found, and it indicates support for GPC.
    s = s.filter('term', **{'status.keyword': 'ok'})
    s = s.filter('term', **{'scan_data.found': True})
    s = s.filter('term', **{'scan_data.gpc.parsed.gpc': True})

    # Only tweet about base domains, not subdomains.
    s = s.filter('term', **{'is_base_domain': True})

    # Don't tweet about sites we've previously tweeted about (or may have).
    # We may have set `tweeting` and failed before we could set `tweeted`. In this case, it's
    # unclear if the tweet went out or not - needs to be checked manually.
    s = s.exclude('term', **{'gpcsup.tweeting': True})
    s = s.exclude('term', **{'gpcsup.tweeted': True})

    s = s.sort('update_dt')
    s = s[:limit]
    s = s.params(request_timeout=timeout)

    response = s.execute()
    return [r.domain for r in response]
def es_filter_papers_grant_range(paperids, ts, te):
    # Named `query` rather than `Q` to avoid shadowing elasticsearch_dsl.Q.
    query = {
        "bool": {
            "must": [
                {"terms": {"PaperId": paperids}},
                {"range": {"date": {"gte": ts, "lte": te}}},
            ]
        }
    }
    s = Search(using=client, index="papers")
    s = s.params(size=1000)
    s = s.query(query)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [r["_source"]["PaperId"] for r in result]
    else:
        print("[es_filter_papers_grant_range] no result")
    return data
def find(self, sort=None, offset=0, limit=10, count=False, timeout=30,
         **filter_params):
    s = Search(using=self.es_client, index=self.resource_index)
    s = s.filter('term', **{'resource.keyword': 'gpc'})

    s = apply_filters(s, **filter_params)

    sort = build_sort(sort)
    if sort:
        s = s.sort(*sort)

    s = s[offset:offset + limit]
    s = s.params(request_timeout=timeout)

    if count:
        s = s.extra(track_total_hits=count)

    response = s.execute()
    sites = [(r.to_dict(), r.meta.score) for r in response]
    return response.hits.total.value, sites
def browse_by_provider(provider, index, page_size, ip, request, filter_dead,
                       page=1, lt=None, li=None):
    """
    Allow users to browse image collections without entering a search query.
    """
    _validate_provider(provider)
    s = Search(index=index)
    s = s.params(preference=str(ip))
    provider_filter = Q('term', provider=provider.lower())
    s = s.filter('bool', should=provider_filter, minimum_should_match=1)
    licenses = lt if lt else li
    s = _filter_licenses(s, licenses)
    start_slice, end_slice = _get_query_slice(s, page_size, page)
    s = s[start_slice:end_slice]
    search_response = s.execute()
    results = _post_process_results(
        s, start_slice, end_slice, page_size, search_response, request, filter_dead
    )
    result_count, page_count = _get_result_and_page_count(
        search_response, results, page_size
    )
    return results, page_count, result_count
def _get_metrics(es, field_name):
    search = Search(using=es, index=current_app.config['INDEX_NAME'])
    # Traverse down the nesting levels from the root field, until we reach the leaf.
    # Need to traverse from the root, because we have to build the search object
    # by adding Nested aggregations consecutively. For example, a nested "samples.foo"
    # field will result in:
    # Search(...).bucket('samples', Nested(path='samples')).metric(...)
    parts = field_name.split('.')
    bucket = search.aggs
    parent = ''
    nestings = []
    for part in parts:
        parent = '%s.%s' % (parent, part) if parent else part
        if parent in current_app.config['NESTED_PATHS']:
            bucket = bucket.bucket(parent, Nested(path=parent))
            nestings.append(parent)
    bucket.metric('max', Max(field=field_name))
    bucket.metric('min', Min(field=field_name))
    bucket.metric('cardinality', Cardinality(field=field_name))
    aggs = search.params(size=0).execute().aggregations.to_dict()
    for nesting in nestings:
        aggs = aggs.get(nesting)
    return (aggs['min']['value'], aggs['max']['value'],
            aggs['cardinality']['value'])
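# Hedged usage sketch (illustrative): for a flat field the min/max/cardinality aggregations sit at
# the top level, while for a nested field such as "samples.foo" they are wrapped inside the
# "samples" bucket, which is why `_get_metrics` walks back through `nestings` before reading the
# values. The field names below are assumptions.
def example_metric_ranges(es):
    age_min, age_max, _ = _get_metrics(es, 'age')
    foo_min, foo_max, _ = _get_metrics(es, 'samples.foo')
    return {'age': (age_min, age_max), 'samples.foo': (foo_min, foo_max)}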
def get_all_ids(index=None, id_field='recid', last_updated=None,
                latest_first=False):
    """Get all record or inspire ids of publications in the search index.

    :param index: name of index to use.
    :param id_field: elasticsearch field to return. Should be 'recid' or 'inspire_id'
    :return: list of integer ids
    """
    if id_field not in ('recid', 'inspire_id'):
        raise ValueError('Invalid ID field %s' % id_field)
    search = Search(using=es, index=index) \
        .filter("term", doc_type=CFG_PUB_TYPE) \
        .source(fields=[id_field])

    if last_updated:
        search = search.filter(
            "range", **{'last_updated': {'gte': last_updated.isoformat()}})

    if latest_first:
        search = search.sort({'last_updated': {'order': 'desc'}})
    else:
        search = search.sort('recid')

    search = search.params(preserve_order=True)
    return [int(h[id_field]) for h in search.scan()]
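# Hedged usage sketch (illustrative): fetch the recids of publications touched in the last week,
# newest first. Assumes the same `es` connection and CFG_PUB_TYPE used by `get_all_ids` above;
# the index name and the seven-day window are example values, not taken from the source.
from datetime import datetime, timedelta

def example_recent_recids():
    since = datetime.utcnow() - timedelta(days=7)
    return get_all_ids(index='hepdata-main', id_field='recid',
                       last_updated=since, latest_first=True)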
def es_author_normalize(name):
    name = name.replace("-", "")
    name = name.replace("'", "")
    s = Search(using=client, index="authors")
    s = s.query("match", NormalizedName=name)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    sorted_list = sorted([hit["_source"] for hit in result if hit["_score"] > 16],
                         key=itemgetter("Rank"))
    if len(sorted_list) == 0:
        sorted_list = sorted([hit["_source"] for hit in result if hit["_score"] > 13],
                             key=itemgetter("Rank"))
    if len(sorted_list) == 0:
        sorted_list = sorted([hit["_source"] for hit in result],
                             key=itemgetter("Rank"))
    sorted_list = sorted(sorted_list, key=itemgetter("PaperCount"), reverse=True)
    # print(name)
    # print(sorted_list)
    data = {}
    try:
        data = sorted_list[0]
    except Exception as e:
        print("[es_author_normalize] no result", name, e)
    return data
def get_update_list_single_process(self):
    """ Find units that need updating and their sidstopdateret (last updated).
    The sidstopdateret may be inaccurate and thus way too far back in time;
    therefore we cannot simply take the largest sidstopdateret from the database.
    Seems we download around 600 dicts a second with match_all, so it should take
    around 2 hours and 30 minutes. This takes 30, so I need to save half an hour
    on downloads.

    :return datetime (min sidstopdateret), list (enhedsnummer, sidstopdateret)
    """
    enh_samtid_map = self.make_samtid_dict()
    oldest_sidstopdateret = datetime.datetime.utcnow().replace(
        tzinfo=pytz.utc) + datetime.timedelta(days=1)
    update_dicts = {
        x: {
            'units': [],
            'sidstopdateret': oldest_sidstopdateret
        }
        for x in self.source_keymap.values()
    }
    if len(enh_samtid_map) == 0:
        return update_dicts
    dummy = CvrConnection.update_info(samtid=-1, sidstopdateret=self.dummy_date)
    print('Get update time for all data')
    for _type in self.source_keymap.values():
        search = Search(using=self.elastic_client, index=self.index)
        search = search.query('match_all')
        sidst_key = '{0}.sidstOpdateret'.format(_type)
        samt_key = '{0}.samtId'.format(_type)
        field_list = ['_id', sidst_key, samt_key]
        # field_list = ['_id'] + ['{0}.sidstOpdateret'.format(key) for key in self.source_keymap.values()] + \
        #     ['{0}.samtId'.format(key) for key in self.source_keymap.values()]
        search = search.fields(fields=field_list)
        params = {'scroll': self.elastic_search_scroll_time, 'size': 2**12}
        search = search.params(**params)
        print('ElasticSearch Query: ', search.to_dict())
        generator = search.scan()
        for cvr_update in tqdm.tqdm(generator):
            enhedsnummer = int(cvr_update.meta.id)
            raw_dat = cvr_update.to_dict()
            samtid = raw_dat[samt_key][0] if samt_key in raw_dat else None
            sidstopdateret = raw_dat[sidst_key][0] if sidst_key in raw_dat else None
            if sidstopdateret is None or samtid is None:
                continue
            current_update = enh_samtid_map[enhedsnummer] \
                if enhedsnummer in enh_samtid_map else dummy
            if samtid > current_update.samtid:
                utc_sidstopdateret = utc_transform(sidstopdateret)
                update_dicts[_type]['sidstopdateret'] = min(
                    utc_sidstopdateret, update_dicts[_type]['sidstopdateret'])
                update_dicts[_type]['units'].append(
                    (enhedsnummer, utc_sidstopdateret))
        # break
    print('Update Info: ')
    print([(k, v['sidstopdateret'], len(v['units']))
           for k, v in update_dicts.items()])
    return update_dicts
def get_asset_names(self, start):
    s = Search(using='objects', index="objects-asset") \
        .query('prefix', symbol__keyword=start) \
        .source(['symbol'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    asset_names = [hit.symbol for hit in s.scan()]
    return asset_names
def get_accounts(self, account_ids, size=1000):
    s = Search(using='objects', index="objects-account", extra={'size': size})
    s = s.filter('terms', id=account_ids)
    s = s.source(['id', 'name', 'options.voting_account'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    accounts = [hit.to_dict() for hit in s.scan()]
    return accounts
def get_asset_ids(self):
    s = Search(using='objects', index="objects-asset") \
        .query('match_all') \
        .source(['id'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    asset_ids = [hit.id for hit in s.scan()]
    return asset_ids
def paper_info_cache_query(paper_ids, batch_size=DEFAULT_BATCH, query_filter=None):
    """ Gets paper info from cache.
    """
    start = datetime.now()

    # Query results
    complete_info = list()
    partial_info = list()
    seen = set()

    # Query for paper info
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.filter('terms', _id=paper_ids)
    paper_info_s = paper_info_s.params(size=DEFAULT_BATCH)
    if query_filter is not None:
        paper_info_s = paper_info_s.query(query_filter)

    # Convert query into dictionary format
    for paper_info in paper_info_s.scan():
        paper_info_res = paper_info.to_dict()

        # Remove the creation date for query
        field_del(paper_info_res, 'CreatedDate')

        # Check the type of the result
        if 'FieldsOfStudy' not in paper_info_res:
            continue
        if paper_info_res['cache_type'] == 'partial':
            partial_info.append(paper_info_res)
        else:
            skip = False
            for ref in paper_info_res['References']:
                if 'FieldsOfStudy' not in ref:
                    skip = True
                    continue
            for cit in paper_info_res['Citations']:
                if 'FieldsOfStudy' not in cit:
                    skip = True
                    continue
            if skip:
                continue
            complete_info.append(paper_info_res)

        del paper_info_res['cache_type']

        # Add to seen set
        seen.add(paper_info_res['PaperId'])

    print(batch_size, datetime.now() - start)

    # Check for no results and return
    return {'complete': complete_info, 'partial': partial_info,
            'missing': set(paper_ids) - seen}
def es_issue_count(es_client: Any, container_id: str, year: int, volume: str,
                   issue: str) -> int:
    search = Search(using=es_client, index="fatcat_release")
    search = (
        search.filter("term", container_id=container_id)
        .filter("term", year=year)
        .filter("term", volume=volume)
        .filter("term", issue=issue)
        .extra(request_cache=True)
    )
    search = search.params(request_cache="true")

    return search.count()
def get_elastic_search_coverage(query: ReleaseQuery) -> dict:
    search = Search(using=app.es_client, index=app.config["ELASTICSEARCH_RELEASE_INDEX"])
    search = search.query(
        "query_string",
        query=query.q,
        default_operator="AND",
        analyze_wildcard=True,
        allow_leading_wildcard=False,
        lenient=True,
        fields=["biblio"],
    )
    search.aggs.bucket(
        "preservation",
        "terms",
        field="preservation",
        missing="_unknown",
    )
    if query.recent:
        date_today = datetime.date.today()
        start_date = str(date_today - datetime.timedelta(days=60))
        end_date = str(date_today + datetime.timedelta(days=1))
        search = search.filter("range", release_date=dict(gte=start_date, lte=end_date))
    search = search[:0]

    search = search.params(request_cache=True)
    search = search.params(track_total_hits=True)

    resp = wrap_es_execution(search)

    preservation_bucket = agg_to_dict(resp.aggregations.preservation)
    preservation_bucket["total"] = _hits_total_int(resp.hits.total)
    for k in ("bright", "dark", "shadows_only", "none"):
        if k not in preservation_bucket:
            preservation_bucket[k] = 0
    if app.config["FATCAT_MERGE_SHADOW_PRESERVATION"]:
        preservation_bucket["none"] += preservation_bucket["shadows_only"]
        preservation_bucket["shadows_only"] = 0
    stats = {
        "total": _hits_total_int(resp.hits.total),
        "preservation": preservation_bucket,
    }

    return stats
def pfos_prop_query(paper_ids):
    ''' Get fields of study for papers.
    '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    pfos_targets = ['PaperId', 'FieldOfStudyId']

    # Query for paper fields of study
    pfos_s = Search(index='paperfieldsofstudy', using=client)
    pfos_s = pfos_s.query('terms', PaperId=paper_ids)
    pfos_s = pfos_s.source(pfos_targets)
    pfos_s = pfos_s.params(request_timeout=TIMEOUT)

    # Convert pfos into dictionary format
    results = dict()
    fos_ids = set()
    for pfos in pfos_s.scan():
        pfos_res = pfos.to_dict()

        # Get fields
        paper_id = pfos_res['PaperId']
        del pfos_res['PaperId']

        # Field of study
        if 'FieldOfStudyId' in pfos_res:
            fos_ids.add(pfos_res['FieldOfStudyId'])

        # Aggregate results
        if paper_id in results:
            results[paper_id].append(pfos_res)
        else:
            results[paper_id] = [pfos_res]

    fos_names, fos_levels = fos_name_level_dict_query(list(fos_ids))

    res = dict()
    for p_id, pfos_info_list in results.items():
        pfos_res = list()
        for pfos_info in pfos_info_list:
            if 'FieldOfStudyId' in pfos_info:
                if pfos_info['FieldOfStudyId'] in fos_names:
                    pfos_info['FieldOfStudyName'] = fos_names[pfos_info['FieldOfStudyId']]
                    pfos_info['FieldOfStudyLevel'] = fos_levels[pfos_info['FieldOfStudyId']]
                else:
                    continue
            pfos_res.append(pfos_info)
        res[p_id] = pfos_res

    # Return as dictionary
    return res
def es_get_paper_conf_year(confid, year):
    s = Search(using=client, index="papers")
    s = s.query(
        Q('bool',
          must=[Q('match', ConferenceSeriesId=confid),
                Q('match', Year=year)]))
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    return result
def es_search_papers_from_confid(confid, papercnt):
    s = Search(using=client, index="papers") \
        .query("match", ConferenceSeriesId=confid)
    s = s.params(preserve_order=True)
    data = []
    for position, hit in enumerate(s.scan()):
        if position == papercnt:
            break
        data.append(hit.to_dict())
    return data
def es_search_aff_info_from_pid(paperid):
    s = Search(using=client, index="paperauthoraffiliations")
    s = s.query("match", PaperId=paperid)
    s = s.params(size=1000)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [res["_source"] for res in result]
    else:
        print("[es_search_aff_info_from_pid] no result", paperid)
    return data
def es_get_paper_fos(paperid):
    s = Search(using=client, index="paperfieldsofstudy")
    s = s.query("match", PaperId=paperid)
    s = s.params(size=500)
    response = s.execute()
    result = response.to_dict()["hits"]["hits"]
    data = []
    if result:
        data = [res["_source"] for res in result]
    else:
        print("[es_get_paper_fos] no result", paperid)
    return data
def fos_name_query(fos_ids):
    if not fos_ids:
        return []

    fos_target = 'NormalizedName'

    # Query for field-of-study names
    fos_s = Search(index='fieldsofstudy', using=client)
    fos_s = fos_s.query('terms', FieldOfStudyId=fos_ids)
    fos_s = fos_s.source(fos_target)
    fos_s = fos_s.params(request_timeout=30)

    return list(map(itemgetter(fos_target), fos_s.scan()))
def download_all_data_to_file(self, filename):
    """
    :return: str: filename, datetime: download time, bool: new download or use old file
    """
    params = {
        'scroll': self.elastic_search_scroll_time,
        'size': self.elastic_search_scan_size
    }
    search = Search(using=self.elastic_client, index=self.index)
    search = search.query('match_all')
    search = search.params(**params)
    download_all_dicts_to_file(filename, search)
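# `download_all_dicts_to_file` is referenced above but not shown. A minimal sketch of what such a
# helper might look like, assuming newline-delimited JSON output; the real implementation may
# differ.
import json

def download_all_dicts_to_file(filename, search):
    """Stream every hit of `search` to `filename` as one JSON object per line."""
    with open(filename, 'w') as outfile:
        for hit in search.scan():
            doc = hit.to_dict()
            doc['_id'] = hit.meta.id  # keep the document id alongside the source
            outfile.write(json.dumps(doc, default=str) + '\n')
    return filename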
def search(self, index, doc_type, col_filters=None):
    # find whether we have a search alias for the given index
    if index in settings.ES_ROLLOVER:
        index = settings.ES_ROLLOVER[index]['search_index']
    logger.debug("Searching index %s for doc_type %s and col_filters %s"
                 % (index, doc_type, col_filters))
    s = Search(using=self.client, index=index, doc_type=doc_type)
    if col_filters:
        for col_filter in col_filters:
            if isinstance(col_filter, ColumnFilter):
                s = s.filter(col_filter.query_type, **col_filter.query)
            else:
                raise ValueError('Column Filter is not an instance of'
                                 ' ColumnFilter class')
    s = s.params(size=MAX_NUMBER_DOCS)
    results = s.execute()
    logger.debug("Search returned %s records from elasticsearch." % len(results))
    return [res.to_dict() for res in results]
class Elastic(LogProvider):
    def __init__(self, config_file='config.cfg'):
        super(Elastic, self).__init__()
        self.percentage = 10.0
        self.minimum_occurrences = 250

        # The ConfigParser documentation points out that there's no way to force default config
        # options outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')

        for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2',
                              'index': 'nxapi', 'doc_type': 'events'}.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(
            hosts=[host],
            use_ssl=use_ssl,
            index=self.index,
            version=self.version,
            doc_type=self.doc_type,
            timeout=30,
            retry_on_timeout=True
        )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()

    def initialize_search(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def export_search(self):
        return self.search

    def import_search(self, search):
        self.search = search

    def get_filters(self):
        return self.search.to_dict()

    def add_filters(self, filters, regexp=False, negative=False):
        """ Add `filters` to the query.
        `filters` is a dict of the form {'field': value, 'field2': value2}, but you can also use
        a list of values instead of a `str`. They'll be combined with an _or_ (and not an _and_).

        :param dict filters:
        :param bool regexp:
        :param bool negative:
        :return:
        """
        # We need to use multi_match, since we get the fields names dynamically.
        for key, value in filters.items():
            if isinstance(value, set):
                value = list(value)

            # There is no need to process empty values.
            if not value:
                continue

            if isinstance(value, list):
                if negative:
                    self.search = self.search.query(Q('bool', must_not=[
                        reduce(operator.or_,
                               [Q('multi_match', query=v, fields=[key]) for v in value])]))
                else:
                    self.search = self.search.query(Q('bool', must=[
                        reduce(operator.or_,
                               [Q('multi_match', query=v, fields=[key]) for v in value])]))
            else:
                if negative:
                    self.search = self.search.query(~Q("multi_match", query=value, fields=[key]))
                else:
                    self.search = self.search.query(Q("multi_match", query=value, fields=[key]))

    def get_top(self, field, size=250):
        """ Get the top values for the given `field`

        :param str field: the field to filter on
        :param int size: how many top values to return
        :return dict of int: A structure of the form {value: number_of_hits, value2: number_of_hits2}
        """
        search = self.search
        ret = dict()

        if field in ['uri', 'vers', 'comments', 'server']:
            field = ''.join((field, '.raw'))

        if VERSION < (5, 0, 0):
            self.search = self.search.params(search_type='count', default_operator='AND')
        else:
            self.search = self.search.params(search_type='query_then_fetch')
            # This is documented at https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
            # search_type='count' has been deprecated in ES 2.0
        self.search.aggs.bucket('TEST', 'terms', field=field)
        for hit in self.search.execute(ignore_cache=True).aggregations['TEST']['buckets']:
            ret[hit['key']] = hit['doc_count']
        self.search = search
        return ret

    def get_relevant_ids(self, fields, percentage=0, minimum_occurrences=0):
        """ Return the ids that are spread across (present in) the given `fields`.

        :param list of str fields:
        :param float percentage:
        :param float minimum_occurrences:
        :return set of int:
        """
        minimum_occurrences = minimum_occurrences or self.minimum_occurrences
        percentage = percentage or self.percentage

        ret = set()
        search = self.search
        ids = set(i['id'] for i in self.search.execute())  # get all possible ids
        self.search = search

        for _id in ids:
            search = self.search
            self.add_filters({'id': _id})

            # Get how many different fields there are for a given `id`
            data = collections.defaultdict(set)
            fields_counter = collections.defaultdict(int)
            for res in self.search.execute():
                for field in fields:
                    if res[field] not in data[field]:
                        fields_counter[field] += 1.0
                    data[field].add(res[field])

            # Ignore ids that are present on less than 10% of different values of each field
            for field, content in data.items():
                if len(content) < minimum_occurrences:
                    logging.debug('Discarding id \033[32m%s\033[0m only present %d times.',
                                  _id, len(content))
                    continue
                _percentage = len(content) / fields_counter[field] * 100.0
                if _percentage > percentage:
                    continue
                logging.debug('Discarding id \033[32m%s\033[0m present in %d%% of different values '
                              'of the \033[32m%s\033[0m field', _id, _percentage, field)
                break
            else:
                ret.add(_id)

            self.search = search

        return ret

    def reset_filters(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def get_results(self):
        """ Return a `Result` object obtained from the execution of the search `self.search`.

        :return Result: The `Result` object obtained from the execution of the search `self.search`.
        """
        search = self.search
        result = self.search.scan()
        self.search = search
        return result

    def commit(self):
        """Process list of dict (yes) and push them to DB """
        self.total_objs += len(self.nlist)
        count = 0

        def gen_events(events):
            dicts = list()
            for d in events:
                dicts.extend([{'index': {'_index': 'nxapi', '_type': 'events'}}, d.to_dict()])
                yield dicts.pop(-2)
                yield dicts.pop(-1)

        events = list()
        for entry in self.nlist:
            event = Event(_index=self.index)
            for key, value in entry.items():
                setattr(event, key, value)

            event.whitelisted = False
            event.comments = "import on" + str(datetime.datetime.now())
            events.append(event)
            count += 1

        try:
            ret = self.client.bulk(gen_events(events))
            ## ToDo parse ret to selectively loop over events to events.save() whatever happens
        except TransportError as e:
            logging.warning("We encountered an error trying to continue.")
            for event in events:
                event.save(using=self.client)
                ## ToDo find a way to change the hardcoded 'events' for ES doctype
                ## elasticsearch_dsl Issue 689

        self.total_commits += count
        logging.debug("Written " + str(self.total_commits) + " events")
        del self.nlist[0:len(self.nlist)]