def once_a_day():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data from Steam API!')
    games = get_games_db()
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id, game_name)
        try:
            gm = steam_api.get_game(game_id, 'temporal')
            log.info('Steam API: succeeded!')
            gm.update(steam_spy.get_game(game_id, 'temporal'))
            log.info('Steam SPY: succeeded!')
            gm.update(steam_currency.get_game(game_id, 'temporal'))
            log.info('Steam Currency: succeeded!')
            log.info('Starting insertion into Elasticsearch')
            elastic.update(game_id, gm, 'game_tmp')
            log.info('Finished insertion into Elasticsearch')
        except Exception as error:
            if isinstance(error, GameNotFound):
                log.warning(error)
            else:
                log.error(error)
                time.sleep(300)
                games.append(game)  # re-queue the failed game; the loop picks it up later

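# Note on the retry pattern in once_a_day (and once_a_week below): appending to
# a Python list while a `for` loop iterates over it extends the iteration, so a
# game that failed with an unexpected error is retried at the end of the run.
# A minimal sketch of the language behaviour this relies on:
items = ['a']
for item in items:
    if len(items) < 3:
        items.append(item + 'x')  # the loop will also visit the appended items
print(items)  # ['a', 'ax', 'axx']
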
def __init__(
    self,
    doc_loader,
    n_words,
    classify_tweets,
    minimum_gram_length,
    max_distance_entities_doc,
    doc_score_types,
):
    """Get our doc_analyzer, save the minimum score necessary for docs
    and, if the event detection module is turned on, initialize the class
    for that (spinup)."""
    self.n_words = n_words
    self.classify_tweets = classify_tweets
    self.es = Elastic(host=ELASTIC_HOST)
    self.check_toponym_index()
    self.pg = PostgreSQL('gfm')
    super().__init__(self.pg, self.es, doc_score_types, max_distance_entities_doc)
    if self.classify_tweets == 'bert':
        self.text_classifier = TextClassifier()
    self.docs = {}
    doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
    from doc_loader import DocLoaderES
    self.doc_loader = DocLoaderES(*doc_loader_args)

def gzip_to_es(move_per=10000):
    es = Elastic()

    def get_labels():
        # Each line of the gzipped file is "<tweet ID>\t<label>"
        with gzip.open('tweets_labelled.gz', 'rt', encoding='utf-8') as f:
            for line in f:
                ID, label = line.strip().split('\t')
                yield ID, label

    def move_to_db(labels):
        # Build partial-update actions in the bulk helper's action format
        es_update = []
        for ID, label in labels:
            es_update.append({
                'doc': {
                    'event_related': True if label == 'yes' else False
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })
        es.bulk_operation(es_update)

    for i, labels in enumerate(chunker(get_labels(), move_per)):
        print(i)
        move_to_db(labels)

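# gzip_to_es assumes a `chunker` helper that batches any iterable into lists of
# at most `size` items. It is not defined in this file; a minimal sketch of
# what it presumably does:
from itertools import islice

def chunker(iterable, size):
    """Yield successive lists of at most `size` items from `iterable`."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk
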
def insert_new_games():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Inserting new games into Elasticsearch!')
    lst1 = get_games_db()
    lst2 = get_all_games()
    # Only process games that are not in the database yet
    games = [game for game in lst2 if game not in lst1]
    with open('ids_fails.txt', 'a') as fail_id:
        for game in games:
            game_id, game_name = int(game[0]), str(game[1])
            log.info('Starting the extraction of game: %s - %s', game_id, game_name)
            try:
                game = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: succeeded!')
                game.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: succeeded!')
                log.info('Starting insertion into Elasticsearch')
                elastic.update(game_id, game, 'game_est')
                log.info('Finished insertion into Elasticsearch')
            except Exception as error:
                if isinstance(error, GameNotFound):
                    log.warning(error)
                else:
                    log.error(error)
                    time.sleep(300)
                    fail_id.write(str(game_id) + " || " + str(game_name) + "\n")

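# `get_all_games` is not shown here; judging by its use above it returns
# (appid, name) pairs for every game known to Steam. A hypothetical sketch
# using the public ISteamApps/GetAppList endpoint:
import requests

def get_all_games():
    resp = requests.get('https://api.steampowered.com/ISteamApps/GetAppList/v2/')
    resp.raise_for_status()
    return [(app['appid'], app['name']) for app in resp.json()['applist']['apps']]
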
def export():
    es = Elastic()
    query = {}
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    n = 1
    with gzip.open('tweets.gz', 'wt', encoding='utf-8') as f:
        for tweet in tweets:
            if not n % 1000:
                print(f"{n} - {datetime.now()}")
            tweet = tweet['_source']
            # Export only geotagged tweets, one tab-separated record per line
            if 'locations' in tweet:
                n += 1
                ID = tweet['id']
                text = clean_text(tweet['text'], lower=False)
                f.write(f'{ID}\t{text}\n')

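# `clean_text` is assumed by export (and by classify further down); since
# export writes one tab-separated record per line, it must at least collapse
# tabs and newlines. A minimal hypothetical sketch:
import re

def clean_text(text, lower=True):
    """Collapse all whitespace runs to single spaces; optionally lowercase."""
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower() if lower else text
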
def once_a_week():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data once a week!')
    games = get_games_db()
    for game in games:
        log.info('Starting the extraction of game: %s - %s', game[0], game[1])
        try:
            gm = youtube_api.get_game(str(game[1]), 'temporal')
            log.info('Youtube API: succeeded!')
            log.info('Starting update in Elasticsearch')
            elastic.update(int(game[0]), gm, 'game_tmp')
            log.info('Finished update in Elasticsearch')
        except Exception as error:
            if isinstance(error, GameNotFound):
                log.warning(error)
            else:
                log.error(error)
                time.sleep(3600)
                games.append(game)  # re-queue the failed game for a later retry

def analyze_tweets_subbasin(self, subbasin, languages=None):
    from db.elastic import Elastic
    es = Elastic()
    query = {
        'query': {
            'term': {
                'locations.subbasin_ids_9': subbasin
            }
        },
        'sort': {
            'date': 'asc'
        }
    }
    data = []
    tweets = es.scroll_through(index='floods_all', body=query, source=False)
    for tweet in tweets:
        detailed_locations = [
            loc for loc in tweet['locations']
            if loc['type'] in ('town', 'adm5', 'adm4', 'adm3', 'landmark')
        ]
        if len(detailed_locations) != 1:
            continue
        detailed_location = detailed_locations[0]
        if subbasin not in detailed_location['subbasin_ids_9']:
            continue
        if detailed_location['score'] < 0.2:
            continue
        tweet_lang = tweet['source']['lang']
        if languages and tweet_lang not in languages:
            continue
        data.append((subbasin, tweet['id'], tweet['date'], tweet['text'], tweet_lang, None))
    self.process(data, res_file=subbasin, include_context='hydrology')

def try_fails_id():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Trying to insert the failed ids again!')
    with open('ids_fails.txt', 'r') as games:
        for game in games:
            game_id, game_name = game.split(" || ")
            game_id = int(game_id)
            game_name = game_name.strip()
            log.info('Starting the extraction of game: %s - %s', game_id, game_name)
            try:
                game = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: succeeded!')
                game.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: succeeded!')
                log.info('Starting insertion into Elasticsearch')
                elastic.update(game_id, game, 'game_est')
                log.info('Finished insertion into Elasticsearch')
            except Exception as error:
                if isinstance(error, GameNotFound):
                    log.warning(error)
                else:
                    log.error(error)
                    time.sleep(300)
    os.remove("ids_fails.txt")  # all retries attempted; drop the old failure list

from db.elastic import Elastic
import sys

es = Elastic()


def remove_field_from_index(index, field):
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "exists": {"field": field}
                    }
                ]
            }
        }
    }
    print(f"removing field '{field}' from {es.n_hits(index=index, body=body)} documents in index '{index}'")
    # Use a script to delete the field from each matching document
    body.update({
        "script": {
            "inline": f"ctx._source.remove(\"{field}\")"
        }
    })
    es.update_by_query(index=index, body=body, conflicts='proceed')


if __name__ == '__main__':
    remove_field_from_index(sys.argv[-2], sys.argv[-1])

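# Hypothetical command-line usage of the script above (file name assumed);
# e.g. to drop the `event_related` field that classify() writes into the
# tweet index:
#
#   python remove_field_from_index.py taggs event_related
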
def __init__(self):
    self.keywords = self.set_keywords()
    self.es = Elastic()

def load_docs(self, docs_queue, n_docs_to_unload, start, analysis_length, timestep_length,
              event_1, event_2, timestep_end_str, is_real_time, datetime=datetime):
    try:
        es = Elastic(host=ELASTIC_HOST)
        pg = PostgreSQL('gfm')
        doc_analyzer = DocAnalyzer(es, pg, self.doc_score_types, self.n_words, self.minimum_gram_length)
        # Spin-up phase: load the historic window preceding the first timestep
        spinup_start = start - analysis_length + timestep_length
        self.load_timestep_es(es, doc_analyzer, docs_queue, n_docs_to_unload, spinup_start, start)
        # Catch-up phase: replay timesteps until we reach the present
        timestep = 1
        timestep_end = start + timestep * timestep_length
        while timestep_end < datetime.utcnow():
            query_start = timestep_end - timestep_length
            self.load_timestep_es(es, doc_analyzer, docs_queue, n_docs_to_unload, query_start, timestep_end)
            timestep_end_str.value = self.encode_dt(timestep_end)
            timestep += 1
            timestep_end = start + timestep * timestep_length
            # Hand control to the consumer process and wait until it is done
            event_2.clear()
            event_1.set()
            event_2.wait()
        # Real-time phase: poll for new documents roughly every 3 minutes
        last_timestep_end = timestep_end - timestep_length
        is_real_time.value = True
        while True:
            timestep_end = datetime.utcnow()
            sleep = (timedelta(minutes=3) - (timestep_end - last_timestep_end)).total_seconds()
            if sleep > 0:
                time.sleep(sleep)
                timestep_end = datetime.utcnow()
            self.load_timestep_es(es, doc_analyzer, docs_queue, n_docs_to_unload, last_timestep_end, timestep_end)
            last_timestep_end = timestep_end
            timestep_end_str.value = self.encode_dt(timestep_end)
            event_2.clear()
            event_1.set()
            event_2.wait()
    except Exception:
        raise

}, "publishers": { "type": "keyword", "store": "true" }, "platforms": { "type": "keyword", "store": "true" }, } }, } } try: elastic = Elastic('elastic:9200', 'steam_est') log.info('Elasticsearch connected') log.info('Creating index Steam Estastic on Elasticsearch') elastic.create_index(index_body) log.info('Index Steam Created') games = get_all_games() log.debug(len(games)) for game in games: game_id, game_name = int(game[0]), str(game[1]) log.info('Starting the extraction of game: %s - %s', game_id, game_name) try: game = steam_api.get_game(game_id, 'estastic') log.info('Steam API: successed!') game.update(steam_spy.get_game(game_id, 'estastic')) log.info('Steam SPY: successed!')
def classify():
    es = Elastic()
    classify_per = 10_000
    if refresh:  # `refresh` is expected to be defined at module level
        # Drop previous classifications so every tweet is relabelled
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')
    predictor = Predictor()
    # Select geotagged tweets that have not been classified yet
    query = {
        'query': {
            "bool": {
                "must": [
                    {
                        'exists': {
                            'field': 'locations'
                        }
                    }
                ],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }
    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)

    def classify_subset(tweet_subset):
        IDs = []
        examples = []
        for tweet in tweet_subset:
            tweet = tweet['_source']
            IDs.append(tweet['id'])
            examples.append({
                "id": tweet['id'],
                "sentence1": clean_text(tweet['text'], lower=False),
                "label": 0  # dummy label; the predictor overwrites it
            })
        labels = predictor(examples)
        es_update = []
        for ID, label in zip(IDs, labels):
            es_update.append({
                'doc': {
                    'event_related': True if label == 'yes' else False
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })
        es.bulk_operation(es_update)

    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i/n*100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)
        if len(tweet_subset) == classify_per:
            classify_subset(tweet_subset)
            tweet_subset = []
    if tweet_subset:
        # Flush the final partial batch
        classify_subset(tweet_subset)

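# `es.bulk_operation` belongs to the project's Elastic wrapper and is not shown
# in this file. The actions built above ('_op_type': 'update' with a partial
# 'doc') match the action format of elasticsearch.helpers.bulk, so the wrapper
# presumably does little more than this sketch:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_operation(actions, client=None):
    """Forward prepared bulk actions to the Elasticsearch bulk helper."""
    client = client or Elasticsearch()
    bulk(client, actions)
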
# Name of the PostgreSQL database (lowercase)
POSTGRESQL_DB = 'taggs'
# Name of the toponym resolution table
TOPONYM_RESOLUTION_TABLE = 'toponym_resolution_table'
# Refresh time of the realtime geotagging module
REAL_TIME_TAGGER_REFRESH_TIME = 300  # sec
# Name of the Elasticsearch index with tweets
TWEETS_INDEX = 'taggs'
# Name of the Elasticsearch index with toponyms
TOPONYM_INDEX = 'toponyms'
# Update tweets in the database with their locations (flag for testing purposes)
UPDATE = False

# Connect to databases
es_tweets = Elastic()
es_toponyms = es_tweets
pg_Geotag = PostgreSQL(POSTGRESQL_DB)
pg = PostgreSQL(POSTGRESQL_DB)


# The functions below are meant to connect to your database.
class TweetAnalyzerCustom:
    # ID = ID of the tweet as str
    # tweet = {
    #     'date': '%a %b %d %H:%M:%S +0000 %Y',
    #     'user': {
    #         'id': user ID,
    #         'location': user location,
    #         'time zone': user time zone,
    #     },