def get_answers(self, **kwargs):
    """Attach the ground-truth label to each question and count mispredictions.

    Expected kwargs: index, questions, gt_session, optional config_relative_path.
    Each question gains a "label" key read from the ground-truth session field.
    Returns (questions, wrong_labels).
    """
    if "config_relative_path" in kwargs:
        my_connector = Es_connector(
            index=kwargs["index"],
            config_relative_path=kwargs["config_relative_path"])
    else:
        my_connector = Es_connector(index=kwargs["index"])

    wrong_labels = 0
    all_ids = self.join_ids(kwargs["questions"])
    # Fix: request at least as many hits as there are questions; the ES
    # default size of 10 silently dropped ground-truth tweets for larger
    # batches, causing an IndexError on gt_tweet[0] below.
    res = my_connector.search({
        "size": max(len(kwargs["questions"]), 10),
        "query": {"match": {"id_str": all_ids}}
    })

    for question in kwargs["questions"]:
        question_id = self.classifier.extract_filename_no_ext(
            question["filename"])
        # Pick the hit whose id_str matches this question's id.
        gt_tweet = [
            tweet for tweet in res["hits"]["hits"]
            if tweet["_source"]["id_str"] == question_id
        ]
        question["label"] = gt_tweet[0]["_source"][kwargs["gt_session"]]
        if question["pred_label"] != question["label"]:
            wrong_labels += 1

    return kwargs["questions"], wrong_labels
def get_clusters(self, index="test3", word=""):
    """Return image-cluster buckets for tweets matching *word*.

    Each bucket is enriched with a representative image and the cluster size,
    both read from the local file "<index>.json".
    """
    connector = Es_connector(index=index)
    response = connector.search({
        "size": 1,
        "query": {
            "simple_query_string": {"fields": ["text"], "query": word}
        },
        "aggs": {
            "group_by_cluster": {
                "terms": {"field": "imagesCluster", "size": 9999}
            }
        }
    })
    buckets = response["aggregations"]["group_by_cluster"]["buckets"]
    with open(index + ".json") as handle:
        duplicates = json.load(handle)["duplicates"]
    for bucket in buckets:
        cluster_images = duplicates[bucket["key"]]
        bucket["image"] = cluster_images[0]
        bucket["size"] = len(cluster_images)
    return buckets
def get_ngrams_by_query(self, query="", **kwargs):
    """Aggregate n-gram counts for *query*, sub-aggregated by session label.

    Expected kwargs: index, n_size, results_size, session.
    Returns the raw ES response, or {} on failure.
    """
    try:
        connector = Es_connector(
            index=kwargs["index"],
            config_relative_path=self.config_relative_path)
        ngram_field = kwargs["n_size"] + "grams.keyword"
        session_field = kwargs["session"] + ".keyword"
        return connector.search({
            "query": query,
            "size": 0,
            "aggs": {
                "ngrams_count": {
                    "terms": {
                        "field": ngram_field,
                        "size": kwargs["results_size"]
                    },
                    "aggs": {
                        "status": {"terms": {"field": session_field}}
                    }
                }
            }
        })
    except Exception as e:
        print('Error: ' + str(e))
        traceback.print_exc()
        return {}
def get_tweets(self, index="test3", word=""):
    """Start a paginated (scroll) search for tweets whose text matches *word*.

    Cleanup: removed two large blocks of commented-out alternative queries
    (plain search / bigSearch) that obscured the live implementation.
    """
    my_connector = Es_connector(index=index)
    res = my_connector.init_paginatedSearch({
        "query": {
            "simple_query_string": {"fields": ["text"], "query": word}
        }
    })
    return res
def update_tweets_state_by_event_ngram(self, **kwargs):
    """Relabel tweets containing *ngram* that match any target term and
    currently carry *query_label* in *session*, setting them to *new_label*.

    Expected kwargs: index, target_terms, ngramsPropName, ngram, session,
    query_label, new_label.
    """
    connector = Es_connector(index=kwargs["index"], doc_type="tweet")
    must_clauses = [
        {"match_phrase": {kwargs["ngramsPropName"]: kwargs["ngram"]}},
        {"match": {kwargs["session"]: kwargs["query_label"]}},
    ]
    query = {
        "query": {
            "bool": {
                "should": kwargs["target_terms"],
                "minimum_should_match": 1,
                "must": must_clauses
            }
        }
    }
    return connector.update_query(query, kwargs["session"],
                                  kwargs["new_label"])
def set_tweet_state(self, index, session, tid, val):
    """Set the per-session label of one tweet (document id *tid*) to *val*."""
    connector = Es_connector(index=index, doc_type="tweet")
    field = 'session_' + session
    return connector.update(tid, {"doc": {field: val}})
def get_sessions(self):
    """Return every stored session document from the sessions index."""
    connector = Es_connector(index=self.sessions_index,
                             doc_type=self.sessions_doc_type)
    return connector.search({"query": {"match_all": {}}})
def set_cluster_state(self, index, session, cid, state):
    """Set the session label of every tweet in image cluster *cid* to *state*."""
    connector = Es_connector(index=index, doc_type="tweet")
    field = 'session_' + session
    cluster_query = {"query": {"term": {"imagesCluster": cid}}}
    return connector.update_query(cluster_query, field, state)
def update_docs_by_ids(self, docs_matches, pred_labed, config_relative_path=None):
    """Write *pred_labed* into the self.session field of every document
    matched by *docs_matches* (a list of ES "match" clauses on id_str).

    Fixes: idiomatic `is not None` check; the search now requests
    len(docs_matches) hits — the ES default of 10 silently capped how many
    documents could be updated per call.
    """
    if len(docs_matches) > 0:
        if config_relative_path is not None:
            my_connector = Es_connector(
                index=self.index, config_relative_path=config_relative_path)
        else:
            my_connector = Es_connector(index=self.index)
        query = {
            "size": max(len(docs_matches), 10),
            "query": {
                "bool": {
                    "should": docs_matches,
                    "minimum_should_match": 1
                }
            }
        }
        original_docs = my_connector.search(query)["hits"]["hits"]
        # Loop body is a no-op for an empty hit list, so no extra guard needed.
        for doc in original_docs:
            my_connector.es.update(
                index=self.index,
                doc_type="tweet",
                id=doc["_id"],
                body={"doc": {self.session: pred_labed}},
                retry_on_conflict=5)
def set_search_status(self, index, session, state, word):
    """Move all 'proposed' tweets whose text matches *word* to *state*."""
    connector = Es_connector(index=index, doc_type="tweet")
    field = 'session_' + session
    query = {
        "query": {
            "bool": {
                "must": {
                    "simple_query_string": {"fields": ["text"],
                                            "query": word}
                },
                "filter": {
                    "bool": {"should": [{"match": {field: "proposed"}}]}
                }
            }
        }
    }
    return connector.update_query(query, field, state)
def get_tweets(self, index, doc_field):
    """Scroll through every document that has *doc_field*.

    Returns a list of {'_source': {doc_field: tokenized_text,
    'timestamp_ms': ...}} entries, with the field tokenized by self.tknzr.
    """
    connector = Es_connector(index=index)
    page = connector.init_paginatedSearch({
        "_source": [doc_field, "timestamp_ms"],
        "query": {"exists": {"field": doc_field}}
    })
    sid = page["sid"]
    scroll_size = page["scroll_size"]
    collected = []
    # Walk the scroll until an empty page signals the end.
    while scroll_size > 0:
        for tweet in page["results"]:
            source = tweet["_source"]
            collected.append({
                '_source': {
                    doc_field: self.tknzr.tokenize(source[doc_field]),
                    "timestamp_ms": source["timestamp_ms"]
                }
            })
        page = connector.loop_paginatedSearch(sid, scroll_size)
        scroll_size = page["scroll_size"]
    return collected
def get_event_state_tweets_count(self, index="test3", session="", words="", state="confirmed"):
    """Count tweets matching *words* whose session label equals *state*."""
    connector = Es_connector(index=index)
    query = {
        "query": {
            "bool": {
                "must": [{"match": {"text": {"query": words}}}],
                "filter": {
                    "bool": {
                        "should": [{"match": {"session_" + session: state}}]
                    }
                }
            }
        }
    }
    return connector.count(query)['count']
def get_event_tweets(self, index="test3", main_term="", related_terms=""):
    """Start a scroll search for event tweets sorted by relevance.

    Each related term is boosted by its own weight; the main term gets a
    fixed boost of 2.
    """
    connector = Es_connector(index=index)
    should_terms = [
        {"match": {"text": {"query": term['word'], "boost": term['value']}}}
        for term in related_terms
    ]
    should_terms.append(
        {"match": {"text": {"query": main_term, "boost": 2}}})
    query = {"sort": ["_score"], "query": {"bool": {"should": should_terms}}}
    return connector.init_paginatedSearch(query)
def get_tweets_query_state(self, index="test3", word="", state="proposed", session=""):
    """Scroll search for tweets matching *word* whose *session* field is *state*."""
    connector = Es_connector(index=index)
    query = {
        "query": {
            "bool": {
                "must": {
                    "simple_query_string": {"fields": ["text"],
                                            "query": word}
                },
                "filter": {
                    "bool": {"should": [{"match": {session: state}}]}
                }
            }
        }
    }
    return connector.init_paginatedSearch(query)
def get_tweets(session, index, state='confirmed'):
    """Fetch every tweet whose 'session_<session>' label equals *state*."""
    connector = Es_connector(index=index, doc_type='tweet')
    state_query = {"query": {"term": {"session_" + session: state}}}
    return connector.bigSearch(state_query)
def update_session_results(self, id, events, impact_data):
    """Persist *events* and *impact_data* onto the session document *id*."""
    connector = Es_connector(index=self.sessions_index,
                             doc_type=self.sessions_doc_type)
    payload = {"doc": {"events": events, "impact_data": impact_data}}
    return connector.update(id, payload)
def get_tweets_state(self, index="test3", session="", state="proposed"):
    """Scroll search for all tweets whose session label equals *state*."""
    connector = Es_connector(index=index)
    state_query = {"query": {"term": {"session_" + session: state}}}
    return connector.init_paginatedSearch(state_query)
def get_similar_docs(self, **kwargs):
    # Propagate each answered question's label to every document with
    # byte-identical textual content ("duplicates").
    # Expected kwargs: index, questions (each with "str_id" and "label"),
    # session, text_field.
    # Returns a list of {"filename", "label", <text_field>} dicts.
    if len(kwargs["questions"]) == 0:
        return []
    my_connector = Es_connector(index=kwargs["index"])  # , config_relative_path='../')
    duplicated_docs = []
    # One "match" clause per answered question id.
    docs_ids_matches = [{"match": {"id_str": {"query": question["str_id"] }}} for question in kwargs["questions"]]
    # Fetch the original docs that are still labelled "proposed" in this session.
    # NOTE(review): the ES default size is 10 — batches of more than 10
    # questions may silently miss originals; confirm expected batch size.
    docs_original_textual_content = my_connector.search({
        "query": {
            "bool": {
                "should": docs_ids_matches,
                "minimum_should_match": 1,
                "must": [
                    {
                        "match": {
                            kwargs["session"]: "proposed"
                        }
                    }
                ]
            }
        }
    })
    for doc in docs_original_textual_content["hits"]["hits"]:
        # Exact (keyword) match on the raw text to find identical duplicates.
        query = {
            "query": {
                "bool": {
                    "must": [
                        {
                            "term": {
                                "text.keyword": {
                                    "value": doc["_source"][kwargs["text_field"]]
                                }
                            }
                        }
                    ]
                }
            }
        }
        matching_docs = my_connector.search(query)
        # More than one hit means real duplicates exist (the original itself
        # counts as one of the hits).
        if matching_docs["hits"]["total"]>1:
            # Recover the label assigned to the question that owns this doc.
            label = [question for question in kwargs["questions"] if question["str_id"] == doc["_source"]["id_str"]][0]["label"]
            for dup_doc in matching_docs["hits"]["hits"]:
                duplicated_docs.append({
                    "filename": dup_doc["_source"]["id_str"],
                    "label": label,
                    kwargs["text_field"]: dup_doc["_source"][kwargs["text_field"]]
                })
    return duplicated_docs
def get_event_tweets2(self, index="test3", main_term="", related_terms="", cid=0):
    """Scroll search for event tweets restricted to image cluster *cid*.

    Related terms carry their own boost, the main term a fixed boost of 2;
    results are sorted by relevance score.
    """
    connector = Es_connector(index=index)
    should_terms = [
        {"match": {"text": {"query": term['word'], "boost": term['value']}}}
        for term in related_terms
    ]
    should_terms.append(
        {"match": {"text": {"query": main_term, "boost": 2}}})
    query = {
        "sort": ["_score"],
        "query": {
            "bool": {
                "should": should_terms,
                "minimum_should_match": 1,
                "must": [{"match": {"imagesCluster": cid}}]
            }
        }
    }
    return connector.init_paginatedSearch(query)
def get_event_filter_tweets(self, index="test3", main_term="", related_terms="", state="proposed", session=""):
    """Scroll search for boosted event-term tweets filtered to *session* == *state*."""
    connector = Es_connector(index=index)
    should_terms = [
        {"match": {"text": {"query": term['word'], "boost": term['value']}}}
        for term in related_terms
    ]
    should_terms.append(
        {"match": {"text": {"query": main_term, "boost": 2}}})
    query = {
        "sort": ["_score"],
        "query": {
            "bool": {
                "must": [{"bool": {"should": should_terms}}],
                "filter": {
                    "bool": {"should": [{"match": {session: state}}]}
                }
            }
        }
    }
    return connector.init_paginatedSearch(query)
def get_words_count(self, index="test3", words=""):
    """Count tweets whose text matches *words*."""
    connector = Es_connector(index=index)
    query = {
        "query": {
            "simple_query_string": {"fields": ["text"], "query": words}
        }
    }
    return connector.count(query)['count']
def export_event(self, index, session):
    """Dump every confirmed tweet of *session*, excluding all session_* fields."""
    connector = Es_connector(index=index)
    return connector.bigSearch({
        "_source": {"excludes": ["session_*"]},
        "query": {"term": {"session_" + session: "confirmed"}}
    })
def remove_tmp_predictions_field(self, **kwargs):
    """Drop the temporary '<session>_tmp' field from every answered document.

    Expected kwargs: index, session, answers (each answer has an "id").
    """
    connector = Es_connector(index=kwargs["index"], doc_type="tweet")
    # Painless script that removes the temporary prediction field.
    removal_script = "ctx._source.remove('" + kwargs["session"] + "_tmp')"
    for answer in kwargs["answers"]:
        connector.update_by_query(
            {"query": {"match": {"_id": answer["id"]}}},
            removal_script)
def get_event_image(self, index="test3", main_term="", related_terms=""):
    """Return one tweet that has media and best matches the boosted event terms."""
    connector = Es_connector(index=index)
    should_terms = [
        {"match": {"text": {"query": term['word'], "boost": term['value']}}}
        for term in related_terms
    ]
    should_terms.append(
        {"match": {"text": {"query": main_term, "boost": 2}}})
    query = {
        "size": 1,
        "_source": [
            "id_str", "imagesCluster", "session_Twitter2015",
            "extended_entities"
        ],
        "query": {
            "bool": {
                # Only tweets that actually carry media attachments.
                "must": {"exists": {"field": "extended_entities"}},
                "should": should_terms
            }
        }
    }
    return connector.search(query)
def generate_ngrams_for_index(self, **kwargs):
    """Scroll through *index* and generate n-grams for every page of tweets.

    Expected kwargs: index, from_property, prop, length, optional query.
    Progress is exposed through self.current_thread_percentage.
    Returns True on success, False on any failure.

    Fix: the original divided by zero when the index was empty
    (scroll_size == 0) or when total < scroll_size (total_scrolls == 0).
    """
    try:
        self.current_thread_percentage = 0
        print("Starting")
        my_connector = Es_connector(index=kwargs["index"])
        query = kwargs.get('query', {"query": {"match_all": {}}})
        res = my_connector.init_paginatedSearch(query)
        sid = res["sid"]
        scroll_size = res["scroll_size"]
        total = int(res["total"])
        # Guard against an empty first page (empty index).
        total_scrolls = int(total / scroll_size) if scroll_size > 0 else 0
        processed_scrolls = 0
        print("from_property:", kwargs['from_property'])
        while scroll_size > 0:
            self.gerenate_ngrams_for_tweets(
                res["results"],
                from_property=kwargs['from_property'],
                prop=kwargs["prop"],
                index=kwargs["index"],
                length=kwargs["length"])
            res = my_connector.loop_paginatedSearch(sid, scroll_size)
            scroll_size = res["scroll_size"]
            processed_scrolls += 1
            # Guard against total_scrolls == 0 (fewer docs than one page).
            if total_scrolls > 0:
                self.current_thread_percentage = round(
                    processed_scrolls * 100 / total_scrolls, 0)
            print("Completed: ", self.current_thread_percentage, "%")
        # Signal completion so the client knows to stop polling for logs.
        self.current_thread_percentage = 100
        return True
    except Exception as e:
        print('Error: ' + str(e))
        return False
def get_range_count(self, index, start, end):
    """Count documents whose timestamp_ms lies strictly between *start* and *end*."""
    connector = Es_connector(index=index)
    query = {
        "query": {
            "range": {
                "timestamp_ms": {"gt": str(start), "lt": str(end)}
            }
        }
    }
    print(query)
    return connector.count(query)['count']
def get_big_tweets_scroll(self, index="test3", word=""):
    """Start a scroll search (selected fields only) for tweets matching *word*."""
    connector = Es_connector(index=index)
    return connector.init_paginatedSearch({
        "_source": [
            "text", "id_str", "extended_entities", "user", "created_at",
            "link"
        ],
        "query": {
            "simple_query_string": {"fields": ["text"], "query": word}
        }
    })
def get_end_date(self, index):
    """Return the _source of the most recent document, ordered by @timestamp."""
    connector = Es_connector(index=index)
    latest = connector.search_size(
        {
            "_source": ["@timestamp", "timestamp_ms"],
            "query": {"match_all": {}},
            "sort": [{"@timestamp": {"order": "desc"}}]
        }, 1)
    return latest['hits']['hits'][0]['_source']
def clear_tmp_predictions(self, **kwargs):
    """Reset every '<session>_tmp' field in the index back to 'proposed'.

    Expected kwargs: index, session.
    """
    connector = Es_connector(index=kwargs["index"], doc_type="tweet")
    tmp_field = kwargs["session"] + "_tmp"  # e.g. session_lyon2015_test_01_tmp
    res = connector.update_by_query(
        {"query": {"exists": {"field": tmp_field}}},
        "ctx._source." + kwargs["session"] + "_tmp = 'proposed'")
def updatePropertyValue(self, **kwargs):
    """Set a single property on one tweet document via a partial update.

    Expected kwargs: tweet (with "_id"), index, property_name, property_value.
    """
    tweet = kwargs["tweet"]
    partial_doc = {
        "doc": {kwargs["property_name"]: kwargs["property_value"]}
    }
    Es_connector(index=kwargs["index"]).es.update(
        index=kwargs["index"],
        doc_type="tweet",
        id=tweet["_id"],
        body=partial_doc,
        retry_on_conflict=5)