def test_scan_iterates_through_all_docs(data_client):
    s = Search(index='git').filter('term', _type='commits')
    commits = list(s.scan())

    assert 52 == len(commits)
    assert set(d['_id'] for d in DATA if d['_type'] == 'commits') == set(c.meta.id for c in commits)
def test_scan_iterates_through_all_docs(data_client):
    s = Search(index='flat-git')
    commits = list(s.scan())

    assert 52 == len(commits)
    assert set(d['_id'] for d in FLAT_DATA) == set(c.meta.id for c in commits)
def update_sentiments(self):
    from watson_developer_cloud import ToneAnalyzerV3Beta
    tone_analyzer = ToneAnalyzerV3Beta(username='******',
                                       password='******',
                                       version='2016-02-11')
    client = connections.get_connection()
    search = Search(using=client, index='articles', doc_type='article')
    q = Q('bool', must=[Q('missing', field='watson_analyzed')])
    search = search.query(q)
    counter = 0
    for result in search.scan():
        doc = Article.get(result.meta.id)
        try:
            analysis = tone_analyzer.tone(text=doc.body)
            tone_categories = analysis['document_tone']['tone_categories']
            emotion_tones = list(filter(lambda x: x['category_id'] == 'emotion_tone',
                                        tone_categories))[0]
            doc.tone = {}
            for tone in emotion_tones['tones']:
                doc.tone[tone['tone_id']] = tone['score']
            doc.watson_success = True
        except WatsonException:
            continue
        finally:
            doc.watson_analyzed = True
            doc.save()
        counter += 1
        print(counter)
    if counter == 0:
        raise RealError()
def get_asset_names(self, start):
    s = Search(using='objects', index="objects-asset") \
        .query('prefix', symbol__keyword=start) \
        .source(['symbol'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    asset_names = [hit.symbol for hit in s.scan()]
    return asset_names
def getListFromFields(index, field):
    s = Search(using=es, index=index).query().source([field])
    valuemap = {}
    for i in s.scan():
        valuemap[i[field]] = 1
    values = list(valuemap.keys())
    values.sort()
    return values
def interval_ids(self, start, end):
    """Retrieve list of ids occurring between start and end."""
    logger.debug('retrieving ids from %s to %s', start, end)
    q = Search(using=self.client, index=self.index) \
        .fields(['']) \
        .filter('exists', field=self.analysis_field) \
        .filter('range', **{self.date_field: {'gt': start, 'lte': end}})
    return [elem.meta.id for elem in q.scan()]
def search_longterm_topics(self):
    must_not = [Q('match', longterm_topic='0')]
    q = Q('bool', must_not=must_not)
    s = Search(using=self.client, index=self.news_index).query(q)
    for hit in s.scan():
        yield hit
def get_accounts(self, account_ids, size=1000):
    s = Search(using='objects', index="objects-account", extra={'size': size})
    s = s.filter('terms', id=account_ids)
    s = s.source(['id', 'name', 'options.voting_account'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    accounts = [hit.to_dict() for hit in s.scan()]
    return accounts
def get_asset_ids(self):
    s = Search(using='objects', index="objects-asset") \
        .query('match_all') \
        .source(['id'])
    s = s.params(clear_scroll=False)  # Avoid calling DELETE on ReadOnly apis.
    asset_ids = [hit.id for hit in s.scan()]
    return asset_ids
def _get_all_mbean_attr(self, connection, mbean, index, dsl_class='match'):
    s = Search(using=connection, index=index). \
        query(dsl_class, ObjectName=mbean). \
        sort({"@timestamp": {"order": 'desc'}})
    response = []
    for hit in s.scan():
        response.append(self._get_attr_obj([hit], mbean))
    return response
def mixpagedatain(scan, indexbase):
    s = Search(using=es, index=indexbase + '-pagedata').filter('terms', domain=[scan['domain']])
    try:
        for i in s.scan():
            scan['pagedata'] = i.data.to_dict()
    except IndexError:
        logging.error('could not find pagedata index for mixing pagedata in')
    return scan
def get_articles_from_query(query, index, labels, type_val, fields, to_csv=False):
    """
    Get the articles corresponding to the query from index
    query: (str) searched expression
    index: (str) name of the index to search
    fields: (str or list) for highlight fields
    :return: (list) [{title: (str), text: (list)}]
    """
    if isinstance(fields, str):
        fields = [fields]
    search_dict = {
        "_source": ["title", "abstract", "publication_date"],
        "query": {
            "bool": {
                "must": [
                    {"match": {"abstract_clean": f"{query}"}},
                    {"match": {"abstract_clean": f"{labels}"}}
                ]
            }
        }
    }
    search = Search().from_dict(search_dict).index(index)
    hits = [{**hit.to_dict(), **{"_id": hit.meta.id}} for hit in search.scan()]
    return hits
def get(self):
    pages = Counter()
    results = []

    # GET A LIST OF ALL THE WEBSITE'S PAGES AND THEIR LAST MODIFIED DATE
    all_pages = {}
    url = "http://pagecloud.com/"
    manifest = requests.get(url + 'manifest.json')
    manifest = manifest.json()
    for i in range(len(manifest['pages'])):
        all_pages[manifest['pages'][i]['name']] = manifest['pages'][i]['lastModified']

    # e.g. www.domain.com/page/ <-- 'request' provides you with '/page'
    s = Search(using=client, index='production-logs-*') \
        .fields(['request']) \
        .query('match_all')
    for hit in s.scan():
        response = hit.to_dict()
        p = response.get('request', [''])[0]
        # Sanitize page name format
        if re.search(r'\?', p) is not None:
            match = re.search(r'(.*)\?', p)
            p = match.group(1)
        pages[p] += 1

    for page in pages.keys():
        # Sanitize page name format (remove all parameters after '?') to find modifiedDate
        cleanPage = page
        if re.search(r'\?', page) is not None:
            match = re.search(r'(.*)\?', page)
            cleanPage = match.group(1)
        if cleanPage[1:] in all_pages.keys():
            lm = all_pages[cleanPage[1:]]
        elif cleanPage == '':
            lm = all_pages['home']
        else:
            lm = 0  # page could not be found in manifest list (might be referrer link!)
        if lm > 0:
            lm = datetime.datetime.fromtimestamp(lm / 1000).strftime("%Y-%m-%d")  # T%H:%M:%S
        results.append({
            'name': page,
            'hits': pages[page],
            'lastModified': lm
        })

    return {'data': {'pages': results}}
def paper_info_cache_query(paper_ids, batch_size=DEFAULT_BATCH, query_filter=None):
    """ Gets paper info from cache. """
    start = datetime.now()

    # Query results
    complete_info = list()
    partial_info = list()
    seen = set()

    # Query for paper info
    paper_info_s = Search(index='paper_info', using=client)
    paper_info_s = paper_info_s.filter('terms', _id=paper_ids)
    paper_info_s = paper_info_s.params(size=DEFAULT_BATCH)
    if query_filter is not None:
        paper_info_s = paper_info_s.query(query_filter)

    # Convert query into dictionary format
    for paper_info in paper_info_s.scan():
        paper_info_res = paper_info.to_dict()

        # Remove the creation date for query
        field_del(paper_info_res, 'CreatedDate')

        # Check the type of the result
        if 'FieldsOfStudy' not in paper_info_res:
            continue
        if paper_info_res['cache_type'] == 'partial':
            partial_info.append(paper_info_res)
        else:
            skip = False
            for ref in paper_info_res['References']:
                if 'FieldsOfStudy' not in ref:
                    skip = True
                    continue
            for cit in paper_info_res['Citations']:
                if 'FieldsOfStudy' not in cit:
                    skip = True
                    continue
            if skip:
                continue
            complete_info.append(paper_info_res)
            del paper_info_res['cache_type']

        # Add to seen set
        seen.add(paper_info_res['PaperId'])

    print(batch_size, datetime.now() - start)

    # Check for no results and return
    return {'complete': complete_info, 'partial': partial_info,
            'missing': set(paper_ids) - seen}
def pr_links_query(paper_ids):
    ''' Get properties of a paper. '''
    # Targets
    pr_targets = ['PaperId', 'PaperReferenceId', 'FieldOfStudyId']

    # Query results
    references = list()
    citations = list()
    fieldsofstudy = list()

    # Result dictionary
    results = dict()
    for paper_id in paper_ids:
        results[paper_id] = {'References': [], 'Citations': [], 'FieldsOfStudy': []}

    # Query for paper references
    ref_s = Search(index='paperreferences', using=client)
    ref_s = ref_s.query('terms', PaperId=paper_ids)
    ref_s = ref_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for ref_info in ref_s.scan():
        results[ref_info[pr_targets[0]]]['References'].append(ref_info[pr_targets[1]])

    # Query for paper citations
    cit_s = Search(index='paperreferences', using=client)
    cit_s = cit_s.query('terms', PaperReferenceId=paper_ids)
    cit_s = cit_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for cit_info in cit_s.scan():
        results[cit_info[pr_targets[1]]]['Citations'].append(cit_info[pr_targets[0]])

    # Query for paper fields of study
    fos_s = Search(index='paperfieldsofstudy', using=client)
    fos_s = fos_s.query('terms', PaperId=paper_ids)
    fos_s = fos_s.params(request_timeout=TIMEOUT)

    # Convert into dictionary format
    for fos_info in fos_s.scan():
        results[fos_info[pr_targets[0]]]['FieldsOfStudy'].append(fos_info[pr_targets[2]])

    # Return results as a dictionary
    return results
def get_fork_events(followers, repos, start_date, end_date):
    client = get_client()
    s = Search(using=client, index="events") \
        .query("match", type="ForkEvent") \
        .filter('range', created_at={'gte': start_date, 'lte': end_date}) \
        .filter('terms', actor__login_h__keyword=followers) \
        .filter('terms', repo__id_h__keyword=repos)
    response = s.scan()
    return response
def subscribe(self):
    while True:
        logger.info(self.last_id)
        s = Search(index='intrical').query('match_all') \
            .filter('range', id={'gt': self.last_id + 1, 'lt': self.last_id + self.page_size})
        for i, doc in enumerate(s.scan()):
            yield i, doc
            self.last_id = int(doc.meta.id)
        self.last_id += self.page_size
def numObjects(self):
    """Return the number of indexed objects."""
    es = get_query_client()
    search = Search(using=es, index=index_name())
    try:
        return len(list(search.scan()))
    except Exception:
        logger.exception('ElasticSearch "count" query failed')
        return 'Problem getting all documents count from ElasticSearch!'
def get_repos_forked_between(start_date, end_date):
    client = get_client()
    s = Search(using=client, index="events") \
        .query("match", type="ForkEvent") \
        .filter('range', created_at={'gte': start_date, 'lte': end_date}) \
        .sort("created_at")
    response = s.scan()
    repos = [hit.to_dict()['repo']['id_h'] for hit in response]
    return repos
def paper_name_query(paper_ids):
    if not paper_ids:
        return []
    target = 'PaperTitle'
    paper_s = Search(index='papers', using=client) \
        .query('terms', PaperId=paper_ids) \
        .source([target]) \
        .params(request_timeout=30)
    return list(map(itemgetter(target), paper_s.scan()))
def test():
    es_conn = ESConnection()
    s = Search(using=es_conn, index='git')
    s.execute()
    for item in s.scan():
        print(item)
        break
def load_snps_by_region(chrom, start, end):
    """Retrieve snp information by region"""
    index = _get_index_from_chr(chrom)
    search_snps = Search().using(es).doc_type('snps').index(index).filter(
        "range", position={"lte": end, "gte": start})
    return {snp.position: snp.to_dict() for snp in search_snps.scan()}
def get_fulldata():
    """
    API to get all data from index
    :return: json with all data
    """
    s = Search(using=es, index="twitter_india_covid")
    df = pd.DataFrame([hit.to_dict() for hit in s.scan()])
    responses = df.to_json(orient="records")
    return responses
def scan_pivot(self, pivot_lang, langs):
    index = TMUtils.lang2es_index(pivot_lang)
    if not self.index_exists(index):
        return
    search = Search(using=self.es, index=index)
    for lang in langs:
        search = search.query('match', target_language=lang)
    for result in search.scan():
        yield result.meta.id
def pfos_prop_query(paper_ids):
    ''' Get properties of a paper. '''
    # Elastic search client
    client = Elasticsearch(conf.get("elasticsearch.hostname"))

    # Targets
    pfos_targets = ['PaperId', 'FieldOfStudyId']

    # Query for paper affiliation
    pfos_s = Search(index='paperfieldsofstudy', using=client)
    pfos_s = pfos_s.query('terms', PaperId=paper_ids)
    pfos_s = pfos_s.source(pfos_targets)
    pfos_s = pfos_s.params(request_timeout=TIMEOUT)

    # Convert paa into dictionary format
    results = dict()
    fos_ids = set()
    for pfos in pfos_s.scan():
        pfos_res = pfos.to_dict()

        # Get fields
        paper_id = pfos_res['PaperId']
        del pfos_res['PaperId']

        # Author
        if 'FieldOfStudyId' in pfos_res:
            fos_ids.add(pfos_res['FieldOfStudyId'])

        # Aggregate results
        if paper_id in results:
            results[paper_id].append(pfos_res)
        else:
            results[paper_id] = [pfos_res]

    fos_names, fos_levels = fos_name_level_dict_query(list(fos_ids))

    res = dict()
    for p_id, pfos_info_list in results.items():
        pfos_res = list()
        for pfos_info in pfos_info_list:
            if 'FieldOfStudyId' in pfos_info:
                if pfos_info['FieldOfStudyId'] in fos_names:
                    pfos_info['FieldOfStudyName'] = fos_names[pfos_info['FieldOfStudyId']]
                    pfos_info['FieldOfStudyLevel'] = fos_levels[pfos_info['FieldOfStudyId']]
                else:
                    continue
            pfos_res.append(pfos_info)
        res[p_id] = pfos_res

    # Return as dictionary
    return res
def run_query_fatcat(query: str, fulltext_only: bool, json_output: Any) -> None:
    """
    Queries fatcat search index (the full regular fatcat.wiki release index)
    for the search string passed (and some filters), iterates over the result
    set (using scroll), and fetches the full release entity (via
    api.fatcat.wiki) for each.

    TODO: group by work_id
    """
    api_session = requests_retry_session()

    es_backend = os.environ.get(
        "ELASTICSEARCH_FATCAT_BASE", "https://search.fatcat.wiki"
    )
    es_index = os.environ.get("ELASTICSEARCH_FATCAT_RELEASE_INDEX", "fatcat_release")
    es_client = elasticsearch.Elasticsearch(es_backend)

    search = Search(using=es_client, index=es_index)
    search = search.exclude("terms", release_type=["stub", "component", "abstract"])
    # "Emerald Expert Briefings"
    search = search.exclude("terms", container_id=["fnllqvywjbec5eumrbavqipfym"])
    # ResearchGate
    search = search.exclude("terms", doi_prefix=["10.13140"])
    if fulltext_only:
        search = search.filter("terms", in_ia=True)
    search = search.query(
        Q("query_string", query=query, default_operator="AND", fields=["biblio"])
    )
    print(f"Expecting {search.count()} search hits", file=sys.stderr)

    search = search.params(clear_scroll=False)
    search = search.params(_source=False)

    results = search.scan()
    for hit in results:
        release_id = hit.meta.id
        resp = api_session.get(
            f"https://api.fatcat.wiki/v0/release/{release_id}",
            params={
                "expand": "container,files,filesets,webcaptures",
                "hide": "references",
            },
        )
        resp.raise_for_status()
        row = dict(
            fatcat_hit=hit.meta._d_,
            release_id=release_id,
            fatcat_release=resp.json(),
        )
        print(json.dumps(row, sort_keys=True), file=json_output)
def ESsearch(pkgname):
    es = createESConnection()
    query = Search(index='t2').using(es).query("term", product=pkgname)
    res = query.scan()
    cvelist = []
    for hit in res:
        cveitem = cve(hit)
        cvelist.append(cveitem)
    # print cvelist
    return cvelist
def indexDump(indexName, sortField="@timestamp"):
    definedSearch = Search(index=indexName).sort({sortField: {"order": "desc"}})
    indexData = definedSearch.scan()
    # for doc in indexData['hits']['hits']:
    #     print("%s) %s" % (doc['_id'], doc['_source']))
    return indexData
def load_genes_by_region(chrom, start, end, features):
    """Retrieve genes by region"""
    index = _get_index_from_chr(chrom)
    search_genes = Search().using(es).doc_type('genes').index(index).filter(
        "range", positions={"lte": end, "gte": start})
    if not features:
        # Search objects are immutable; source() returns a modified copy.
        search_genes = search_genes.source(exclude=['isoforms'])
    genes = [gene.to_dict() for gene in search_genes.scan()]
    for gene in genes:
        gene['ko_associations'] = load_gene_ko_associations(gene['name'],
                                                            return_only_significant=True)
    return genes
def calculateJenks(auth, args, es):
    # Build the view string from arguments
    view = "dsra_{eq_scenario}_{retrofit_prefix}_{dbview}".format(
        **{'eq_scenario': args.eqScenario,
           'retrofit_prefix': args.retrofitPrefix,
           'dbview': args.dbview})
    response = Search(using=es, index=view)

    # Create a dataframe containing the full series of values from the specified view and field
    df = pd.DataFrame([getattr(hit.properties, args.field) for hit in response.scan()],
                      columns=[args.field])

    # Use jenkspy to create natural breaks
    breaks = jenkspy.jenks_breaks(df[args.field], nb_class=args.bins)
    return breaks
def es_search_papers_from_confid(confid, papercnt):
    s = Search(using=client, index="papers") \
        .query("match", ConferenceSeriesId=confid)
    s = s.params(preserve_order=True)
    data = []
    for position, hit in enumerate(s.scan()):
        if position == papercnt:
            break
        data.append(hit.to_dict())
    return data
def entity_search(self, logfile):
    """ Search and count entities: to optimize and find count from indexer """
    print("Entity count is running .....")
    logfile.write("Entity count is running ..... \n")
    logfile.write("============================================== \n")
    es = Elasticsearch(timeout=300)
    k = 0
    for entity_rep in self.entity_dict:
        for entity in self.entity_dict[entity_rep]:
            entity_space_sep = entity.replace("_", " ")
            s = Search(using=es, index="pubmed") \
                .params(request_timeout=300) \
                .query("match_phrase", abstract=entity_space_sep)
            num_hits = 0
            num_valid_hits = 0
            num_counts = 0
            for hit in s.scan():
                num_hits += 1
                cur_pmid = str(hit.pmid)
                if cur_pmid not in self.concerned_pmid_set:
                    continue
                abs_lower = hit.abstract.lower().replace("-", " ")
                entity_lower = entity_space_sep.lower().replace("-", " ")
                entity_cnt = abs_lower.count(entity_lower)
                if entity_cnt == 0:
                    continue
                else:
                    self.entity_count_per_pmid[cur_pmid][entity_rep] += entity_cnt
                    num_valid_hits += 1
                    num_counts += entity_cnt
            logfile.write(str(entity) + "# hits:" + str(num_hits) +
                          "# valid hits:" + str(num_valid_hits) +
                          "# counts:" + str(num_counts))
            logfile.write("\n")
            k = k + 1
            if k % 10 == 0:
                print(k, 'entity successfully counted!')
                logfile.write(str(k) + "entity successfully counted!")
                logfile.write("\n")
def search_graphs1(request, owner_email=None, names=None, nodes=None, edges=None,
                   tags=None, member_email=None, is_public=None, query=None,
                   limit=20, offset=0, order='desc', sort='name'):
    sort_attr = getattr(db.Graph, sort if sort is not None else 'name')
    order_by = getattr(db, order if order is not None else 'desc')(sort_attr)
    is_public = int(is_public) if is_public is not None else None

    if member_email is not None:
        member_user = users.controllers.get_user(request, member_email)
        if member_user is not None:
            group_ids = [
                group.id
                for group in users.controllers.get_groups_by_member_id(
                    request, member_user.id)
            ]
        else:
            raise Exception("User with given member_email doesn't exist.")
    else:
        group_ids = None

    if edges is not None:
        edges = [tuple(edge.split(':')) for edge in edges]

    if 'query' in query:
        s = Search(using=settings.ELASTIC_CLIENT, index='graphs')
        s.update_from_dict(query)
        # Search objects are immutable; source() returns a modified copy.
        s = s.source(False)
        graph_ids = [int(hit.meta.id) for hit in s.scan()]
    else:
        graph_ids = None

    total, graphs_list = db.find_graphs(request.db_session,
                                        owner_email=owner_email,
                                        graph_ids=graph_ids,
                                        is_public=is_public,
                                        group_ids=group_ids,
                                        names=names,
                                        nodes=nodes,
                                        edges=edges,
                                        tags=tags,
                                        limit=limit,
                                        offset=offset,
                                        order_by=order_by)
    return total, graphs_list
def search_index(bb):
    '''Retrieves the location names from the elastic index using the given bounding box'''
    if connection_string == '' or index_name == '':
        print("\n###########################################################")
        print("Global ERROR: Elastic host and port or index name not defined")
        print("#############################################################\n")
        exit()

    if not geo_calculations.is_bb_acceptable(bb) or bb[0] > bb[2] or bb[1] > bb[3]:
        print("\n##########################################################")
        print("Global ERROR: Bounding Box is too big, choose a smaller one!")
        print("############################################################\n")
        exit()

    connections.create_connection(hosts=[connection_string], timeout=60)

    query = {
        "bool": {
            "must": {
                "match_all": {}
            },
            "filter": {
                "geo_bounding_box": {
                    "coordinate": {
                        "bottom_left": {"lat": bb[0], "lon": bb[1]},
                        "top_right": {"lat": bb[2], "lon": bb[3]}
                    }
                }
            }
        }
    }

    phrase_search = [Q(query)]

    # to search with a scroll
    e_search = Search(index=index_name).query(Q('bool', must=phrase_search))
    res = e_search.scan()
    return res
def _queryElasticsearch(self, from_date, to_date, query):
    logging.debug("Connecting to ES")
    client = Elasticsearch()
    logging.debug("Beginning search")
    s = Search(using=client, index=self._config['ElasticSearch']['raw_index'])
    s = s.filter('range', **{'EndTime': {'from': from_date, 'to': to_date}})
    logging.debug("About to execute query:\n%s" % str(s.to_dict()))
    for hit in s.scan():
        yield hit
def check_user_in_db(es, user):
    search_q = Search(using=es, index='expense_manager', doc_type='user') \
        .query('match', username=user)
    result = search_q.scan()
    res = []
    for user_found in result:
        if isinstance(user_found.actual_username, utils.AttrList):
            res.extend(user_found.actual_username)
        else:
            res.append(user_found.actual_username)
    print(res)
    return set(res)
def get(self):
    results = []
    clients = Counter()
    for req in _requests:
        ip = req.get('clientip', ['-'])[0]
        clients[ip] += 1

    freqPath = Counter()
    for visitor in clients.keys()[:100]:
        pages = [""]
        s = Search(using=client, index='production-logs-*') \
            .fields(['clientip', 'request']) \
            .query('match', clientip=visitor)
        for page in s.scan():
            page = page.to_dict().get('request', [''])[0]
            print page
            if not ((page.find('.') > -1) or (page == pages[len(pages) - 1])):
                print "true"
                if page == "":
                    page = "/"
                pages.append(page)
        if len(pages) > 2:
            for x in range(0, len(pages) - 1):
                freqPath[str(pages[x] + " " + pages[x + 1])] += 1

    sorted = freqPath.most_common()
    commonTrace = []
    for elem in sorted:
        x, y = elem
        commonTrace.append(x.split(" "))
    soln = self.commonPath(commonTrace)

    rank = 1
    data = []
    for elem in soln:
        data.append({
            'nodes': elem,
            'count': rank
        })
        rank += 1
    return {'data': {'path': data}}
def get(self):
    results = []
    clients = Counter()
    for req in _requests:
        ip = req.get('clientip', ['-'])[0]
        clients[ip] += 1

    path = []
    for visitor in clients.keys()[:100]:
        pages = [""]
        s = Search(using=client, index='production-logs-*') \
            .fields(['clientip', 'request']) \
            .query('match', clientip=visitor)
        for page in s.scan():
            page = page.to_dict().get('request', [''])[0]
            print page
            if not ((page.find('.') > -1) or (page == pages[len(pages) - 1])):
                print "true"
                pages.append(page)
        if len(pages) > 2:
            path.append(pages)

    freqPath = Counter()
    for elem in path:
        string = ' '.join(elem)
        freqPath[string] += 1

    data = []
    for elem in freqPath.keys():
        data.append({
            'nodes': elem.split(" ")[1:-1],
            'count': freqPath[elem]
        })
    return {'data': {'path': data}}
def generate(self):
    client = Elasticsearch()
    results = []

    common_name = self.config.get("query", "%s_commonname" % (self.vo.lower()))
    wildcardvoq = '*' + self.vo.lower() + '*'
    wildcardcommonnameq = '*' + common_name + '*'
    starttimeq = self.start_time.replace('/', '-').replace(' ', 'T')
    endtimeq = self.end_time.replace('/', '-').replace(' ', 'T')

    querystringverbose = '{"bool":{"must":[{"wildcard":{"VOName":"%s"}},' \
                         '{"wildcard":{"CommonName":"%s"}}],' \
                         '"filter":[{"term":{"Resource.ResourceType":"BatchPilot"}},' \
                         '{"range":{"EndTime":{"gte": "%s","lt":"%s"}}}]}}' \
                         % (wildcardvoq, wildcardcommonnameq, starttimeq, endtimeq)

    resultset = Search(using=client, index='gracc-osg-2016*') \
        .query("wildcard", VOName=wildcardvoq) \
        .query("wildcard", CommonName=wildcardcommonnameq) \
        .filter("range", EndTime={"gte": starttimeq, "lt": endtimeq}) \
        .filter(Q({"term": {"Resource.ResourceType": "BatchPilot"}}))

    response = resultset.execute()
    return_code = response.success()  # True if the elasticsearch query completed without errors

    for hit in resultset.scan():
        try:
            globaljobid = hit['GlobalJobId'][0]
            jobid = globaljobid.split('#')[1] + '@' + \
                globaljobid[globaljobid.find('.') + 1:globaljobid.find('#')]
            outstr = "%s\t%s\t%s\t%s\t%s\t%s" % (hit['StartTime'][0],
                                                 hit['EndTime'][0],
                                                 jobid,
                                                 hit['Host']['description'][0],
                                                 hit['Host']['value'][0],
                                                 hit['Resource']['ExitCode'])
            results.append(outstr)
            if self.verbose:
                print >> sys.stdout, outstr
        except KeyError as e:
            pass

    # Figure this out
    # mysql_client_cfg = MySQLUtils.createClientConfig("main_db", self.config)
    # self.connectStr = MySQLUtils.getDbConnection("main_db", mysql_client_cfg, self.config)
    # common_name = self.config.get("query", "%s_commonname" % (self.vo.lower()))
    # select = "select StartTime, EndTime, CONCAT(substring_index(substring(GlobalJobId, 28), '#', 1), '@', " + \
    #          "substring_index(substring(GlobalJobId, 8), '#', 1)), HostDescription, substring_index(Host," + \
    #          "' ', 1), r.Value as Status from JobUsageRecord j, Resource r where r.dbid = j.dbid and" + \
    #          " r.Description = 'ExitCode' and EndTime>= '" + self.start_time + "' and EndTime < '" + \
    #          self.end_time + "' and ResourceType = 'BatchPilot' and CommonName like '%" + common_name + \
    #          "%' and VOName like '%" + self.vo.lower() + "%' order by HostDescription, Host, GlobalJobId, r.Value;"

    if self.verbose:
        print >> sys.stdout, querystringverbose

    # results, return_code = MySQLUtils.RunQuery(select, self.connectStr)
    if not return_code:
        raise Exception('Error to access mysql database')

    # Replaced with print statement in resultset.scan() loop
    # if self.verbose:
    #     print >> sys.stdout, results

    if len(results) == 1 and len(results[0].strip()) == 0:
        print >> sys.stdout, "Nothing to report"
        return

    for line in results:
        tmp = line.split('\t')
        start_time = tmp[0].strip().replace('T', ' ').replace('Z', '')
        end_time = tmp[1].strip().replace('T', ' ').replace('Z', '')
        jobid = tmp[2].strip()
        site = tmp[3].strip()
        if site == "NULL":
            continue
        host = tmp[4].strip()
        status = int(tmp[5].strip())
        job = Job(end_time, start_time, jobid, site, host, status)
        self.run.add_job(site, job)
        clusterid = jobid.split(".")[0]
        if not self.clusters.has_key(clusterid):
            self.clusters[clusterid] = []
        self.clusters[clusterid].append(job)
def analyze_git(es_read, es_write, es_read_index, es_write_index, key):
    # Retrieve projects information
    projects = openstack_projects()
    projects["repository"] = projects["urls"]
    projects = projects.drop("urls", 1)

    # Retrieve uuids info
    uuids = Uuid(pandas.DataFrame(), file_path='openstack_uuids.csv')
    # Retrieve gender info
    enriched_gender = Gender(pandas.DataFrame(), key, "git_gender.csv")

    es_write.indices.delete(es_write_index, ignore=[400, 404])
    es_write.indices.create(es_write_index, body=MAPPING_GIT)

    s = Search(using=es_read, index=es_read_index)
    s.execute()

    commits = []
    cont = 1
    uniq_id = 1
    first = True
    for item in s.scan():
        commits.append(item.to_dict())

        if cont % 15000 == 0:
            git_events = events.Git(commits)
            events_df = git_events.eventize(2)

            print (cont)
            print (len(events_df))

            # Filter information
            data_filtered = FilterRows(events_df)
            events_df = data_filtered.filter_(["filepath"], "-")

            # Add filetype info
            enriched_filetype = FileType(events_df)
            events_df = enriched_filetype.enrich('filepath')

            enriched_gender.data = events_df
            events_df = enriched_gender.enrich("owner")

            aux = pandas.DataFrame(events_df["committer"], columns=["committer"])
            enriched_gender.data = aux
            aux = enriched_gender.enrich("committer")
            events_df["committer_gender"] = aux["gender"]
            events_df["committer_gender_probability"] = aux["gender_probability"]
            events_df["committer_gender_analyzed_name"] = aux["gender_analyzed_name"]
            events_df["committer_gender_count"] = aux["gender_count"]

            splitemail = SplitEmail(events_df)
            events_df = splitemail.enrich("owner")

            # Add author uuid
            uuids.data = events_df
            events_df = uuids.enrich(['user', 'email'])

            print (len(events_df))

            # Add projects information
            events_df = pandas.merge(events_df, projects, how='left', on='repository')
            # Fill NaN projects
            events_df.fillna('notavailable', inplace=True)

            # Deal with surrogates
            convert = ToUTF8(events_df)
            events_df = convert.enrich(["gender_analyzed_name",
                                        "committer_gender_analyzed_name",
                                        "owner", "committer", "user", "username"])

            print (len(events_df))

            commits = []
            uniq_id = upload_data(events_df, es_write_index, es_write, uniq_id)

        cont = cont + 1

    upload_data(events_df, es_write_index, es_write, uniq_id)
def rank(self, event):
    print "start rank"
    color = ['black', 'blue']
    # print self.state.get()
    self.output.delete("1.0", "end")
    arg1 = self.entry1.get()
    arg2 = self.entry2.get()
    context = self.entry3.get()
    self.norm = float(self.entry3.get())
    sep_context = context.split()
    context_id = {}
    self.RelInstance = RelationSet(patternset=self.patternset,
                                   relationlist=self.relationlist)
    s = Search(using=self.es)
    q = Q("match", pentity1={"query": arg1, "operator": "and"}) & \
        Q("match", pentity2={"query": arg2, "operator": "and"})
    s = s.query(q)
    cnt = 0
    if self.rank_method.get() == 'new':
        try:
            type1 = self.entity_type[arg1]
            type2 = self.entity_type[arg2]
            prior = self.prior[type1 + '-' + type2]
        except:
            print "cannot find type"
            return
    if self.model == '':
        for hits in s.scan():
            # if joinRelation(hits.relation) in self.model:
            if cnt % 500 == 0:
                print cnt
            cnt += 1
            self.RelInstance.update_relation(hits.relation, hits.confidence)
            self.RelInstance.update_context_constt(hits.relation, hits.sentence,
                                                   hits.phrase, hits.offset)
            self.RelInstance.update_modifier(arg1, arg2, hits.relation, hits.head,
                                             hits.role, hits.offset, hits.postag,
                                             hits.sentence)
    else:
        for hits in s.scan():
            if joinRelation(hits.relation) in self.model:
                if cnt % 500 == 0:
                    print cnt
                cnt += 1
                self.RelInstance.update_relation(hits.relation, hits.confidence)
                self.RelInstance.update_context_constt(hits.relation, hits.sentence,
                                                       hits.phrase, hits.offset)
                self.RelInstance.update_modifier(arg1, arg2, hits.relation, hits.head,
                                                 hits.role, hits.offset, hits.postag,
                                                 hits.sentence)
    if self.rank_method.get() == 'new':
        print "update score"
        self.RelInstance.new_score(prior, self.norm, self.info)
    if self.state.get() == 'vanilla':
        print "vanilla"
        print "there are %d instances" % cnt
        length = len(self.RelInstance.storage)
        print "There are %d types of relations" % length
        sorted_list = self.RelInstance.sort_score(order=True)
        for ele in sorted_list:
            # context = RelInstance.storage[ele].context
            ct_entity_cnt = self.RelInstance.storage[ele].ct_entity_cnt
            modifier1 = self.RelInstance.storage[ele].modifier1.most_common(1)
            modifier2 = self.RelInstance.storage[ele].modifier2.most_common(1)
            if len(modifier1) == 0:
                modifier1 = [['', '']]
            if len(modifier2) == 0:
                modifier2 = [['', '']]
            name1 = ' '.join(modifier1[0][0].split('@')[::-1])
            name2 = ' '.join(modifier2[0][0].split('@')[::-1])
            self.output.insert("end", ele + ' ' + str(self.RelInstance.storage[ele].score) + ' ', "black")
            self.output.insert("end", '(' + name1 + ', ' + str(modifier1[0][1]) + ')' + ' ' +
                               '(' + name2 + ', ' + str(modifier2[0][1]) + ')' + '\n', "red")
            for entity_ele in ct_entity_cnt.most_common(10):
                self.output.insert("end", str(entity_ele) + ' ', "green")
            self.output.insert("end", '\n\n\n', "black")
    elif (self.state.get() == 'DBSCAN' or self.state.get() == 'Kmeans'):
        input_eps = float(self.eps.get())
        input_min_samples = int(self.min_samples.get())
        print "there are %d instances" % cnt
        Items = self.RelInstance.get_scoreitems()
        Weight = self.RelInstance.get_scores()
        Weight = np.asarray(Weight)
        feature = np.empty((0, 400))
        for triple in Items:
            feature = np.vstack([feature, self.model[joinRelation(triple[0])]])
        if self.state.get() == 'DBSCAN':
            handle = DBSCAN(eps=input_eps, min_samples=input_min_samples)
            handle.fit(feature, sample_weight=Weight)
            n_clusters = len(set(handle.labels_))
            print "there are %d clusters" % n_clusters
        else:
            handle = KMeans(n_clusters=self.num_cluster.get(), init='k-means++',
                            n_init=10, max_iter=300, tol=0.0001,
                            precompute_distances='auto', verbose=0,
                            random_state=None, copy_x=True, n_jobs=1)
            handle.fit(feature)
            n_clusters = self.num_cluster.get()
        cluster_cnt = [0] * n_clusters
        core_sample = [0] * n_clusters
        noise = []
        if self.state.get() == 'DBSCAN':
            for i in range(n_clusters):
                core_sample[i] = []
            for i in handle.core_sample_indices_:
                core_sample[handle.labels_[i]].append(Items[i])
        else:
            cluster_content = [Counter()]
            for j in range(n_clusters - 1):
                cluster_content.append(Counter())
            for k in range(len(Items)):
                cluster_content[handle.labels_[k]][Items[k][0]] += Items[k][1]
        for j in range(len(handle.labels_)):
            if handle.labels_[j] == -1:
                noise.append(Items[j])
            else:
                # if Weight[j] > 2:
                cluster_cnt[handle.labels_[j]] += Weight[j]
        sorted_cluster = np.argsort(cluster_cnt)[::-1]
        clusterid = 0
        if self.state.get() == "DBSCAN":
            for cluster_indices in sorted_cluster:
                rprent = core_sample[cluster_indices]
                sorted_rprent = sorted(rprent, key=itemgetter(1), reverse=True)
                for ele in sorted_rprent:
                    self.output.insert("end", ele[0] + ' ' + str(ele[1]) + ' ', color[clusterid % 2])
                    # self.output.insert("end", ele[0], color[clusterid % 2])
                    ct_entity_cnt = self.RelInstance.storage[ele[0]].ct_entity_cnt
                    modifier1 = self.RelInstance.storage[ele[0]].modifier1.most_common(1)
                    modifier2 = self.RelInstance.storage[ele[0]].modifier2.most_common(1)
                    if len(modifier1) == 0:
                        modifier1 = [['', '']]
                    if len(modifier2) == 0:
                        modifier2 = [['', '']]
                    name1 = ' '.join(modifier1[0][0].split('@')[::-1])
                    name2 = ' '.join(modifier2[0][0].split('@')[::-1])
                    self.output.insert("end", '(' + name1 + ', ' + str(modifier1[0][1]) + ')' + ' ' +
                                       '(' + name2 + ', ' + str(modifier2[0][1]) + ')' + '\n', "red")
                    for entity_ele in ct_entity_cnt.most_common(10):
                        self.output.insert("end", str(entity_ele) + ' ', "green")
                    self.output.insert("end", '\n', "black")
                clusterid += 1
            for noise_ele in noise:
                self.output.insert("end", noise_ele[0] + ' ' + str(noise_ele[1]) + ' NOISE\n', "black")
        else:
            for m in sorted_cluster:
                dic = cluster_content[m]
                sorted_list = dic.most_common(10)
                for ele in sorted_list:
                    self.output.insert("end", ele[0] + ' ' + str(ele[1]) + '\n', color[clusterid % 2])
                clusterid += 1
        s = query_missingparam(s, args.parameter, args.method,
                               args.responsecode, args.invert)
        querytype = QUERY_SEARCH
    elif args.cmd == "headervalues":
        s = query_headervals(s, args.header)
        querytype = QUERY_VALUES
    elif args.cmd == "search":
        s = query(s, " ".join(args.query))
        querytype = QUERY_SEARCH
    else:
        argparser.print_help()
        sys.exit(1)

    if querytype == QUERY_SEARCH:
        if args.fields:
            print_debug(s.to_dict())
            r = s.scan()
        else:
            add_default_aggregation(s)
            print_debug(s.to_dict())
            r = s.execute()
    elif querytype == QUERY_VALUES:
        print_debug(s.to_dict())
        r = s.execute()

    if querytype == QUERY_SEARCH:
        if not r:
            print("No matches!")
            sys.exit(0)
        if args.fields:
            for d in r:
                print(d['request']['url'])
def generate(self):
    client = Elasticsearch()
    results = []
    resultset = Search(using=client, index='gracc-osg-2016*').query(
        {
            "bool": {
                "must": [
                    {"wildcard": {"VOName": "*uboone*"}},
                    {"wildcard": {"CommonName": "*uboonegpvm01.fnal.gov"}}
                ],
                "filter": [
                    {"term": {"Resource.ResourceType": "BatchPilot"}},
                    {"range": {
                        "EndTime": {
                            "gte": "2016-05-15T07:30",
                            "lt": "2016-05-16T07:30"
                        }
                    }}
                ]
            }
        }
    )

    for hit in resultset.scan():
        try:
            globaljobid = hit['GlobalJobId'][0]
            jobid = globaljobid.split('#')[1] + '@' + \
                globaljobid[globaljobid.find('.') + 1:globaljobid.find('#')]
            outstr = "%s\t%s\t%s\t%s\t%s\t%s" % (hit['StartTime'][0],
                                                 hit['EndTime'][0],
                                                 jobid,
                                                 hit['Host']['description'][0],
                                                 hit['Host']['value'][0],
                                                 hit['Resource']['ExitCode'][0])
            results.append(outstr)
        except KeyError as e:
            pass

    # mysql_client_cfg = MySQLUtils.createClientConfig("main_db", self.config)
    # self.connectStr = MySQLUtils.getDbConnection("main_db", mysql_client_cfg, self.config)
    # common_name = self.config.get("query", "%s_commonname" % (self.vo.lower()))
    # select = "select StartTime, EndTime, CONCAT(substring_index(substring(GlobalJobId, 28), '#', 1), '@', " + \
    #          "substring_index(substring(GlobalJobId, 8), '#', 1)), HostDescription, substring_index(Host," + \
    #          "' ', 1), r.Value as Status from JobUsageRecord j, Resource r where r.dbid = j.dbid and" + \
    #          " r.Description = 'ExitCode' and EndTime>= '" + self.start_time + "' and EndTime < '" + \
    #          self.end_time + "' and ResourceType = 'BatchPilot' and CommonName like '%" + common_name + \
    #          "%' and VOName like '%" + self.vo.lower() + "%' order by HostDescription, Host, GlobalJobId, r.Value;"

    if self.verbose:
        print >> sys.stdout, "WHAT IS q?"
        # results = query  # this is a placeholder

    # results, return_code = MySQLUtils.RunQuery(select, self.connectStr)
    # if return_code != 0:
    #     raise Exception('Error to access mysql database')

    if self.verbose:
        print >> sys.stdout, "QUERY STATEMENT HERE. THIS IS WHERE select USED TO GO"

    if len(results) == 1 and len(results[0].strip()) == 0:
        print >> sys.stdout, "Nothing to report"
        return

    i = 0
    for line in results:
        tmp = line.split('\t')
        start_time = tmp[0].strip()
        end_time = tmp[1].strip()
        jobid = tmp[2].strip()
        site = tmp[3].strip()
        if site == "NULL":
            continue
        host = tmp[4].strip()
        status = int(tmp[5].strip())
        job = Job(end_time, start_time, jobid, site, host, status)
        self.run.add_job(site, job)
        clusterid = jobid.split(".")[0]
        if not self.clusters.has_key(clusterid):
            self.clusters[clusterid] = []
        self.clusters[clusterid].append(job)
        i += 1
    print i
def fetch_and_clean_up(index_name):
    """ Fetch Elastic data and clean it up """
    # Logstash and HDFS general info
    output_dir = uf.mkdir_if_not_exist("/tmp/exstreamly_cheap_files/elasticsearch_cleanup")
    # logstash_file = os.path.join(output_dir, 'clean_deals.json')

    # HDFS related data
    group = "deals_data_hdfs"
    topic_id = "elastic_deals_data"
    timestamp = time.strftime("%Y%m%d%H%M%S")
    hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
    hadoop_path = "/exstreamly_cheap_main_files/all_deals/history"
    cached_path = "/exstreamly_cheap_main_files/all_deals/cached"
    hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
    cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)

    uf.print_out("Writing the logs to {} which will be pushed to hdfs and S3".format(hadoop_file))

    block_cnt = 0
    client = make_client()
    cc = Search(using=client, index=index_name)
    gen = cc.scan()

    config = configparser.SafeConfigParser()
    config.read("../../config/general.conf")
    config_params = uf.get_config_items(config, settings.PRODUCER_CLEAN_ES_DATA)
    try:
        kafka_hosts = config_params["kafka_hosts"]
        topic = config_params["topic"]
        group = config_params["group"]
        zk_hosts = config_params["zookeeper_hosts"]
    except KeyError:
        raise

    kafka_client = KafkaClient(hosts=kafka_hosts)
    kafka_topic = kafka_client.topics[topic]  # Create if not exist
    uf.print_out("Producing messages to topic {}. Press Ctrl-C to terminate".format(kafka_topic.name))

    # Produce to kafka for distributed consumption
    hdp_output = open(hadoop_file, "w")
    with kafka_topic.get_producer() as producer:
        for event in gen:
            new_string = dict(eval(event.message.encode("utf-8")))
            msg = clean_data(new_string)
            # We can decide to have logstash read from file instead
            # with open(logstash_file, 'a') as log_output:
            #     log_output.write(json.dumps(msg) + '\n')

            # Write to producer.
            producer.produce(json.dumps(msg))

            # Back up to file for HDFS and S3
            hdp_output.write(json.dumps(msg) + "\n")
            if hdp_output.tell() > 100000000:
                hdp_output.close()
                uf.print_out("Block {}: Flushing 100MB file to HDFS => {}".format(str(block_cnt), hadoop_fullpath))
                # place blocked messages into history and cached folders on hdfs
                os.system("hdfs dfs -put {} {}".format(hadoop_file, hadoop_fullpath))
                os.system("hdfs dfs -put {} {}".format(hadoop_file, cached_fullpath))
                # Back up in S3
                uf.print_out("Syncing {} to S3 for back up".format(output_dir))
                os.system("aws s3 sync {} s3://emmanuel-awa/clean_data_from_elastic".format(output_dir))
                # Recreate file handler
                hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(time.strftime("%Y%m%d%H%M%S")))
                hdp_output = open(hadoop_file, "w")
            uf.print_out("Cleaned {} blocks. File size: {}KB".format(block_cnt, hdp_output.tell() / 1000))
            block_cnt += 1
class Elastic(LogProvider):
    def __init__(self, config_file='config.cfg'):
        super(Elastic, self).__init__()
        self.percentage = 10.0
        self.minimum_occurrences = 250

        # The ConfigParser documentation points out that there's no way to force defaults config option
        # outside the "DEFAULT" section.
        config = ConfigParser()
        config.read(config_file)
        if not config.has_section('elastic'):
            config.add_section('elastic')

        for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2',
                              'index': 'nxapi', 'doc_type': 'events'}.items():
            if not config.has_option('elastic', option):
                config.set('elastic', option, value)

        self.version = config.getint('elastic', 'version')
        self.index = config.get('elastic', 'index')
        use_ssl = config.getboolean('elastic', 'use_ssl')
        host = config.get('elastic', 'host')
        self.doc_type = config.get('elastic', 'doc_type')
        self.client = connections.create_connection(
            hosts=[host],
            use_ssl=use_ssl,
            index=self.index,
            version=self.version,
            doc_type=self.doc_type,
            timeout=30,
            retry_on_timeout=True
        )

        Event.init(index=self.index)
        index = Index(self.index, using=self.client)
        index.doc_type(Event)
        self.initialize_search()

    def initialize_search(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def export_search(self):
        return self.search

    def import_search(self, search):
        self.search = search

    def get_filters(self):
        return self.search.to_dict()

    def add_filters(self, filters, regexp=False, negative=False):
        """
        Add `filters` to the query.
        `filters` is a dict of the form {'field': value, field2: value2}, but you can also
        use a list of values instead of a `str`. They'll be added as a _or_ (and not a _and_).

        :param dict filters:
        :param bool regexp:
        :param bool negative:
        :return:
        """
        # We need to use multi_match, since we get the fields names dynamically.
        for key, value in filters.items():
            if isinstance(value, set):
                value = list(value)

            # There is no need to process empty values.
            if not value:
                continue

            if isinstance(value, list):
                if negative:
                    self.search = self.search.query(Q('bool', must_not=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
                else:
                    self.search = self.search.query(Q('bool', must=[
                        reduce(operator.or_, [Q('multi_match', query=v, fields=[key]) for v in value])])
                    )
            else:
                if negative:
                    self.search = self.search.query(~Q("multi_match", query=value, fields=[key]))
                else:
                    self.search = self.search.query(Q("multi_match", query=value, fields=[key]))

    def get_top(self, field, size=250):
        """
        Get the top values for the given `field`

        :param str field: the field to filter on
        :param int size: how many top values to return
        :return dict of int: a structure of the form {value: number_of_hits, value2: number_of_hits2}
        """
        search = self.search
        ret = dict()

        if field in ['uri', 'vers', 'comments', 'server']:
            field = ''.join((field, '.raw'))

        if VERSION < (5, 0, 0):
            self.search = self.search.params(search_type='count', default_operator='AND')
        else:
            self.search = self.search.params(search_type='query_then_fetch')
            # This is documented at https://elasticsearch-py.readthedocs.io/en/master/api.html#elasticsearch.Elasticsearch.search
            # search_type='count' has been deprecated in ES 2.0

        self.search.aggs.bucket('TEST', 'terms', field=field)
        for hit in self.search.execute(ignore_cache=True).aggregations['TEST']['buckets']:
            ret[hit['key']] = hit['doc_count']

        self.search = search
        return ret

    def get_relevant_ids(self, fields, percentage=0, minimum_occurrences=0):
        """
        Return the ids that are distributed/present across the `fields`.

        :param list of str fields:
        :param float percentage:
        :param float minimum_occurrences:
        :return set of int:
        """
        minimum_occurrences = minimum_occurrences or self.minimum_occurrences
        percentage = percentage or self.percentage

        ret = set()
        search = self.search
        ids = set(i['id'] for i in self.search.execute())  # get all possible ID
        self.search = search

        for _id in ids:
            search = self.search
            self.add_filters({'id': _id})

            # Get how many different fields there are for a given `id`
            data = collections.defaultdict(set)
            fields_counter = collections.defaultdict(int)
            for res in self.search.execute():
                for field in fields:
                    if res[field] not in data[field]:
                        fields_counter[field] += 1.0
                    data[field].add(res[field])

            # Ignore id that are present on less than 10% of different values of each fields
            for field, content in data.items():
                if len(content) < minimum_occurrences:
                    logging.debug('Discarding id \033[32m%s\033[0m only present %d times.',
                                  _id, len(content))
                    continue
                _percentage = len(content) / fields_counter[field] * 100.0
                if _percentage > percentage:
                    continue
                logging.debug('Discarding id \033[32m%s\033[0m present in %d%% of different values '
                              'of the \033[32m%s\033[0m field', _id, _percentage, field)
                break
            else:
                ret.add(_id)

            self.search = search

        return ret

    def reset_filters(self):
        self.search = Search(using=self.client, index=self.index).extra(size=10000)

    def get_results(self):
        """
        Return a `Result` object obtained from the execution of the search `self.search`.

        :return Result: the `Result` object obtained from the execution of the search `self.search`.
        """
        search = self.search
        result = self.search.scan()
        self.search = search
        return result

    def commit(self):
        """Process list of dict (yes) and push them to DB """
        self.total_objs += len(self.nlist)
        count = 0

        def gen_events(events):
            dicts = list()
            for d in events:
                dicts.extend([{'index': {'_index': 'nxapi', '_type': 'events'}}, d.to_dict()])
                yield dicts.pop(-2)
                yield dicts.pop(-1)

        events = list()
        for entry in self.nlist:
            event = Event(_index=self.index)
            for key, value in entry.items():
                setattr(event, key, value)
            event.whitelisted = False
            event.comments = "import on" + str(datetime.datetime.now())
            events.append(event)
            count += 1

        try:
            ret = self.client.bulk(gen_events(events))
            ## ToDo parse ret to selectively loop over events to events.save() whatever happens
        except TransportError as e:
            logging.warning("We encountered an error trying to continue.")
            for event in events:
                event.save(using=self.client)
            ## ToDo find a way to change the hardcoded 'events' for ES doctype
            ## elasticsearch_dsl Issue 689

        self.total_commits += count
        logging.debug("Written " + str(self.total_commits) + " events")
        del self.nlist[0:len(self.nlist)]