Exemple #1
0
def once_a_day():
    """Daily refresh of temporal Steam data.

    For each game in the database, merge the dicts returned by the Steam
    API, SteamSpy and the currency service, then upsert the result into
    the 'game_tmp' index.  A failed game is re-queued at the end of the
    list after a 5-minute back-off, so the loop keeps retrying until it
    succeeds.
    """
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data from Steam API!')
    games = get_games_db()
    # NOTE: appending to `games` while iterating it is deliberate — it
    # re-queues failed games so they are retried in the same run.
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id,
                 game_name)
        try:
            gm = steam_api.get_game(game_id, 'temporal')
            log.info('Steam API: successed!')
            gm.update(steam_spy.get_game(game_id, 'temporal'))
            log.info('Steam SPY: successed!')
            gm.update(steam_currency.get_game(game_id, 'temporal'))
            log.info('Steam Currency: successed!')
            log.info('Starting insersion in the Elasticsearch')
            elastic.update(game_id, gm, 'game_tmp')
            log.info('Finishing insersion in the Elasticsearch')
        except GameNotFound as error:
            # Expected condition (game has no upstream data): warn only.
            log.warning(error)
            time.sleep(300)
            games.append(game)
        except Exception as error:
            log.error(error)
            time.sleep(300)
            games.append(game)
 def __init__(
     self,
     doc_loader,
     n_words,
     classify_tweets,
     minimum_gram_length,
     max_distance_entities_doc,
     doc_score_types,
 ):
     """Set up the document analysis pipeline.

     Stores the word-count and tweet-classification settings, opens the
     Elasticsearch and PostgreSQL connections, initializes the parent
     class with the scoring configuration, and constructs the document
     loader used to feed documents in.

     NOTE(review): the `doc_loader` parameter is never used — a
     DocLoaderES is built from the other arguments instead; confirm
     whether the parameter can be dropped at the call sites.
     """
     self.n_words = n_words
     self.classify_tweets = classify_tweets
     # Connect to Elasticsearch and make sure the toponym index exists
     # before anything queries it.
     self.es = Elastic(host=ELASTIC_HOST)
     self.check_toponym_index()
     self.pg = PostgreSQL('gfm')
     super().__init__(self.pg, self.es, doc_score_types,
                      max_distance_entities_doc)
     # Only load the (heavy) BERT text classifier when requested.
     if self.classify_tweets == 'bert':
         self.text_classifier = TextClassifier()
     # Cache of analyzed documents, keyed per document.
     self.docs = {}
     doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
     # Imported lazily — presumably to avoid a circular import; verify.
     from doc_loader import DocLoaderES
     self.doc_loader = DocLoaderES(*doc_loader_args)
Exemple #3
0
def gzip_to_es(move_per=10000):
    """Bulk-copy tweet labels from 'tweets_labelled.gz' into Elasticsearch.

    Each line of the gzip file is '<ID>\\t<label>'; a label of 'yes'
    marks the tweet as event related.  Updates are applied in chunks of
    `move_per` documents.

    :param move_per: number of label updates sent per bulk request.
    """
    es = Elastic()

    def get_labels():
        # Stream line by line instead of readlines(): the original
        # materialized the entire (potentially large) file in memory.
        with gzip.open('tweets_labelled.gz', 'rt', encoding='utf-8') as f:
            for line in f:
                ID, label = line.strip().split('\t')
                yield ID, label

    def move_to_db(labels):
        # One bulk 'update' action per (ID, label) pair.
        es_update = [{
            'doc': {
                'event_related': label == 'yes'
            },
            '_index': DOCUMENT_INDEX,
            '_id': ID,
            '_op_type': 'update',
        } for ID, label in labels]

        es.bulk_operation(es_update)

    for i, labels in enumerate(chunker(get_labels(), move_per)):
        print(i)  # progress: chunk counter
        move_to_db(labels)
Exemple #4
0
def insert_new_games():
    """Insert games that are not yet stored in Elasticsearch.

    Diffs the full game list against the games already in the database,
    extracts each new game from the Steam API and SteamSpy, and writes it
    to the 'game_est' index.  IDs that fail are appended to
    'ids_fails.txt' for a later retry (see try_fails_id).
    """
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Insert new games on Elasticsearch!')
    lst1 = get_games_db()
    lst2 = get_all_games()
    games = [game for game in lst2 if game not in lst1]
    # `with` guarantees the failure log is flushed and closed even if the
    # loop raises — the original leaked the file handle.
    with open("ids_fails.txt", "a") as fail_id:
        for game in games:
            game_id, game_name = int(game[0]), str(game[1])
            log.info('Starting the extraction of game: %s - %s', game_id,
                     game_name)
            try:
                # Renamed from `game` (the original shadowed the loop
                # variable with the fetched data dict).
                data = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: successed!')
                data.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: successed!')
                log.info('Starting insersion in the Elasticsearch')
                elastic.update(game_id, data, 'game_est')
                log.info('Finishing insersion in the Elasticsearch')
            except GameNotFound as error:
                log.warning(error)
                time.sleep(300)
                fail_id.write(str(game_id) + " || " + str(game_name) + "\n")
            except Exception as error:
                log.error(error)
                time.sleep(300)
                fail_id.write(str(game_id) + " || " + str(game_name) + "\n")
Exemple #5
0
def export():
    """Export every located tweet to 'tweets.gz' as '<ID>\\t<text>' lines."""
    es = Elastic()

    query = {}
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    n = 1
    with gzip.open('tweets.gz', 'wt', encoding='utf-8') as f:
        for tweet in tweets:
            # Progress marker every 1000 exported tweets.
            if n % 1000 == 0:
                print(f"{n} - {datetime.now()}")
            source = tweet['_source']
            # Only tweets that were successfully geolocated are exported.
            if 'locations' not in source:
                continue
            n += 1
            ID = source['id']
            text = clean_text(source['text'], lower=False)
            f.write(f'{ID}\t{text}\n')
Exemple #6
0
def once_a_week():
    """Weekly refresh of YouTube data for every game in the database.

    Pulls YouTube stats per game and updates the 'game_tmp' index.
    Failed games are re-queued at the end of the list; every game is
    followed by a one-hour pause (rate limiting).
    """
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data once a week!')
    games = get_games_db()
    for game in games:
        log.info('Starting the extraction of game: %s - %s', game[0], game[1])
        try:
            gm = youtube_api.get_game(str(game[1]), 'temporal')
            log.info('Youtube API: successed!')
            log.info('Starting update in the Elasticsearch')
            elastic.update(int(game[0]), gm, 'game_tmp')
            log.info('Finishing update in the Elasticsearch')
        except GameNotFound as error:
            log.warning(error)
            # BUG FIX: the original appended `game` on EVERY iteration
            # while iterating `games`, so the list grew forever and the
            # loop never terminated.  Re-queue only failed games, matching
            # once_a_day().
            games.append(game)
        except Exception as error:
            log.error(error)
            games.append(game)
        time.sleep(3600)
Exemple #7
0
    def analyze_tweets_subbasin(self, subbasin, languages=None):
        """Collect tweets geotagged to `subbasin` and hand them to process().

        Keeps only tweets with exactly one detailed location (town,
        adm3-5 or landmark) that itself maps to the subbasin with a score
        of at least 0.2, optionally restricted to the given languages.
        """
        from db.elastic import Elastic
        es = Elastic()
        query = {
            'query': {
                'term': {
                    'locations.subbasin_ids_9': subbasin
                }
            },
            'sort': {
                'date': 'asc'
            }
        }

        detailed_types = ('town', 'adm5', 'adm4', 'adm3', 'landmark')
        rows = []
        for tweet in es.scroll_through(index='floods_all',
                                       body=query,
                                       source=False):
            candidates = [
                loc for loc in tweet['locations']
                if loc['type'] in detailed_types
            ]
            # Require a single, unambiguous detailed location.
            if len(candidates) != 1:
                continue

            location = candidates[0]
            # The detailed location must itself map to this subbasin…
            if subbasin not in location['subbasin_ids_9']:
                continue
            # …with sufficient confidence.
            if location['score'] < .2:
                continue

            lang = tweet['source']['lang']
            if languages and lang not in languages:
                continue

            rows.append((subbasin, tweet['id'], tweet['date'], tweet['text'],
                         lang, None))
        self.process(rows, res_file=subbasin, include_context='hydrology')
Exemple #8
0
def try_fails_id():
    """Retry inserting the games recorded in 'ids_fails.txt'.

    Each line has the form '<id> || <name>'.  The file is removed after
    the pass completes; failures are only logged, not re-recorded.
    """
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Trying insert the fails ids again!')
    # `with` closes the handle before os.remove() — the original left the
    # file open, leaking the descriptor (and removal of an open file
    # fails on Windows).
    with open("ids_fails.txt", "r") as games:
        for line in games:
            game_id, game_name = line.split(" || ")
            game_id = int(game_id)
            log.info('Starting the extraction of game: %s - %s', game_id,
                     game_name)
            try:
                data = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: successed!')
                data.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: successed!')
                log.info('Starting insersion in the Elasticsearch')
                elastic.update(game_id, data, 'game_est')
                log.info('Finishing insersion in the Elasticsearch')
            except GameNotFound as error:
                log.warning(error)
                time.sleep(300)
            except Exception as error:
                log.error(error)
                time.sleep(300)
    os.remove("ids_fails.txt")
Exemple #9
0
from db.elastic import Elastic
import sys

es = Elastic()


def remove_field_from_index(index, field):
    """Delete `field` from every document in `index` that has it.

    Uses update-by-query with a painless script; conflicts are skipped
    ('proceed') so concurrent writes do not abort the operation.

    :param index: name of the Elasticsearch index to update.
    :param field: name of the document field to remove.
    """
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "exists": {"field": field}
                    }
                ]
            }
        }
    }
    # Message fix: the original claimed documents were being removed;
    # only the field is removed from the matching documents.
    print(f"removing field '{field}' from {es.n_hits(index=index, body=body)} documents in index '{index}'")
    body.update({
        "script": {
            # Pass the field via script params instead of interpolating it
            # into the painless source (avoids breakage/injection when the
            # field name contains quotes).
            "inline": "ctx._source.remove(params.field)",
            "params": {"field": field}
        }
    })
    es.update_by_query(index=index, body=body, conflicts='proceed')


if __name__ == '__main__':
    # Usage: python <script> <index> <field>
    # Takes the last two CLI arguments as index name and field name.
    remove_field_from_index(sys.argv[-2], sys.argv[-1])
Exemple #10
0
 def __init__(self):
     """Build the keyword set and open the Elasticsearch connection."""
     self.keywords = self.set_keywords()
     self.es = Elastic()
Exemple #11
0
    def load_docs(self,
                  docs_queue,
                  n_docs_to_unload,
                  start,
                  analysis_length,
                  timestep_length,
                  event_1,
                  event_2,
                  timestep_end_str,
                  is_real_time,
                  datetime=datetime):
        """Producer loop: stream documents from Elasticsearch into docs_queue.

        Phase 1 (spin-up): loads `analysis_length` of history before `start`.
        Phase 2 (catch-up): walks fixed `timestep_length` windows until the
        wall clock is reached, synchronizing with the consumer via
        `event_1`/`event_2` after each window and publishing the window end
        through `timestep_end_str`.
        Phase 3 (real time): sets `is_real_time` and keeps loading windows
        of at least 3 minutes, sleeping to pace itself.

        The original wrapped the body in `except Exception as e: raise`,
        which is a no-op handler; it has been removed.
        """
        es = Elastic(host=ELASTIC_HOST)
        pg = PostgreSQL('gfm')
        doc_analyzer = DocAnalyzer(es, pg, self.doc_score_types,
                                   self.n_words, self.minimum_gram_length)
        # Spin-up: pre-load history so the first timestep has context.
        spinup_start = start - analysis_length + timestep_length
        self.load_timestep_es(es, doc_analyzer, docs_queue,
                              n_docs_to_unload, spinup_start, start)

        timestep = 1
        timestep_end = start + timestep * timestep_length

        # Catch-up: replay fixed-size windows until we reach "now".
        while timestep_end < datetime.utcnow():
            query_start = timestep_end - timestep_length

            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, query_start,
                                  timestep_end)

            timestep_end_str.value = self.encode_dt(timestep_end)
            timestep += 1
            timestep_end = start + timestep * timestep_length

            # Hand off to the consumer and wait until it signals back.
            event_2.clear()
            event_1.set()
            event_2.wait()

        last_timestep_end = timestep_end - timestep_length
        is_real_time.value = True

        # Real time: load whatever arrived since the last window,
        # pacing to at least 3 minutes per iteration.
        while True:
            timestep_end = datetime.utcnow()

            sleep = (timedelta(minutes=3) -
                     (timestep_end - last_timestep_end)).total_seconds()
            if sleep > 0:
                time.sleep(sleep)
                timestep_end = datetime.utcnow()

            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, last_timestep_end,
                                  timestep_end)
            last_timestep_end = timestep_end
            timestep_end_str.value = self.encode_dt(timestep_end)

            event_2.clear()
            event_1.set()
            event_2.wait()
Exemple #12
0
                },
                "publishers": {
                    "type": "keyword",
                    "store": "true"
                },
                "platforms": {
                    "type": "keyword",
                    "store": "true"
                },
            }
        },
    }
}

try:
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Elasticsearch connected')
    log.info('Creating index Steam Estastic on Elasticsearch')
    elastic.create_index(index_body)
    log.info('Index Steam Created')
    games = get_all_games()
    log.debug(len(games))
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id,
                 game_name)
        try:
            game = steam_api.get_game(game_id, 'estastic')
            log.info('Steam API: successed!')
            game.update(steam_spy.get_game(game_id, 'estastic'))
            log.info('Steam SPY: successed!')
Exemple #13
0
def classify():
    """Classify located tweets as event-related and write labels back.

    Streams every document in DOCUMENT_INDEX that has 'locations' but no
    'event_related' field yet, runs the predictor in batches of 10,000,
    and bulk-updates the results.  The original silently dropped the
    final partial batch; it is now flushed after the loop.
    """
    es = Elastic()

    classify_per = 10_000

    # NOTE(review): `refresh` is not defined in this function — presumably
    # a module-level flag; confirm it exists before running.
    if refresh:
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')

    predictor = Predictor()

    query = {
        'query': {
            "bool": {
                "must": [
                    {
                        'exists': {
                            'field': 'locations'
                        }
                    }
                ],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }

    def flush(subset):
        # Predict labels for one batch and bulk-update Elasticsearch.
        IDs = []
        examples = []
        for tweet in subset:
            source = tweet['_source']
            IDs.append(source['id'])
            examples.append({
                "id": source['id'],
                "sentence1": clean_text(source['text'], lower=False),
                "label": 0
            })

        labels = predictor(examples)
        es_update = [{
            'doc': {
                'event_related': label == 'yes'
            },
            '_index': DOCUMENT_INDEX,
            '_id': ID,
            '_op_type': 'update',
        } for ID, label in zip(IDs, labels)]

        es.bulk_operation(es_update)

    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i/n*100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)

        if len(tweet_subset) == classify_per:
            flush(tweet_subset)
            tweet_subset = []

    # BUG FIX: classify the trailing partial batch (< classify_per tweets)
    # instead of discarding it.
    if tweet_subset:
        flush(tweet_subset)
Exemple #14
0
# --- Module configuration and database connections ---

# Name of the PostgreSQL database (lowercase)
POSTGRESQL_DB = 'taggs'
# Name of the toponym resolution table
TOPONYM_RESOLUTION_TABLE = 'toponym_resolution_table'
# Refresh time of the realtime geotagging module
REAL_TIME_TAGGER_REFRESH_TIME = 300  # sec
# Name of the Elasticsearch index with tweets
TWEETS_INDEX = 'taggs'
# Name of the Elasticsearch index with toponyms
TOPONYM_INDEX = 'toponyms'

# Update tweets in the database with their locations (flag for testing purposes)
UPDATE = False

# Connect to databases
es_tweets = Elastic()
es_toponyms = es_tweets  # same cluster serves both indices; alias only
pg_Geotag = PostgreSQL(POSTGRESQL_DB)
pg = PostgreSQL(POSTGRESQL_DB)


# The functions below are meant to connect to your database.
class TweetAnalyzerCustom:
    # ID = ID of the tweet as str
    # tweet = {
    #     'date': '%a %b %d %H:%M:%S +0000 %Y',
    #     'user': {
    #                     'id': user ID,
    #                     'location': user location,
    #                     'time zone': user time zone,
    #     },