def __init__(self):
    """Read the ETL settings and attach the Redis, Postgres and
    Elasticsearch helpers used by this consumer."""
    settings = ETLSettings()
    # Target index and batch size come from configuration.
    self.index_name = settings.elastic_index
    self.limit = settings.etl_size_limit
    # Backing services: queue, source database, search sink.
    self.redis = ETLRedis()
    self.pgbase = ETLPG()
    self.es = ETLElastic()
def __init__(self):
    """Read the ETL settings, keep the shared table descriptors and attach
    the Redis, Postgres and Elasticsearch helpers."""
    settings = ETLSettings()
    # Table descriptors shared with the producer side of the pipeline.
    self.producer_table: ETLProducerTable = postgres_table
    # Target index and batch size come from configuration.
    self.index_name = settings.elastic_index
    self.limit = settings.etl_size_limit
    # Backing services: queue, source database, search sink.
    self.redis = ETLRedis()
    self.pgbase = ETLPG()
    self.es = ETLElastic()
class ETLConsumer:
    """Consume film ids queued in Redis and bulk-load the corresponding
    film documents from Postgres into Elasticsearch."""

    def __init__(self):
        # Wire up configuration and the three backing services
        # (Redis queue, Postgres source, Elasticsearch sink).
        cnf = ETLSettings()
        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit
        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()

    def get_filmsid_from_redis(self, putter) -> None:
        """Poll Redis while the 'consumer' status flag is 'run'.

        Each pass takes up to ``self.limit`` queued film ids, fetches the
        matching rows from Postgres (an empty list when nothing is queued)
        and pushes them into the ``putter`` coroutine. Loops until stop()
        flips the status flag; returns nothing.
        """
        while self.redis.get_status('consumer') == 'run':
            idlists = self.redis.get_filmid_for_work(self.limit)
            films = self.pgbase.get_filmsbyid(tuple(idlists)) if len(idlists) > 0 else []
            putter.send(films)

    @coroutine
    def put_films_to_ES(self) -> bool:
        """Coroutine: receive lists of ETLFilmWork rows and bulk-load them
        into Elasticsearch, clearing the Redis working queue on success."""
        # NOTE(review): despite the "-> bool" annotation, calling this method
        # returns a primed generator (the body yields) — the annotation looks
        # wrong; confirm against the @coroutine decorator's contract.
        while True:
            films: List[ETLFilmWork] = (yield)
            logger.info('Start loading data to elastic')
            # Person fields arrive as 'id : name' strings; split them into
            # plain-name lists and ESPerson objects for the document.
            esfilms = [
                ESMovie(
                    film.id,
                    film.rating,
                    film.imdb_tconst,
                    film.type_name,
                    film.genres,
                    film.title,
                    film.description,
                    [name.split(' : ')[1] for name in film.directors] if film.directors else None,
                    [name.split(' : ')[1] for name in film.actors] if film.actors else None,
                    [name.split(' : ')[1] for name in film.writers] if film.writers else None,
                    [ESPerson(*name.split(' : ')) for name in film.directors] if film.directors else None,
                    [ESPerson(*name.split(' : ')) for name in film.actors] if film.actors else None,
                    [ESPerson(*name.split(' : ')) for name in film.writers] if film.writers else None
                ) for film in films]
            # NOTE(review): bulk_update() is called here WITHOUT an index name,
            # while the other ETLConsumer definition in this file calls
            # self.es.bulk_update(self.index_name, esfilms) — confirm which
            # ETLElastic signature this revision targets.
            if self.es.bulk_update(esfilms):
                self.redis.del_work_queuename()
                logger.info('Data succesfully loaded, delete working queue')
            else:
                # Load failed: back off before the next attempt.
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    def start(self):
        """Mark the consumer as running, ensure the ES index exists and
        enter the blocking polling loop (exits when stop() is called)."""
        if self.redis.get_status('consumer') == 'run':
            logger.warning('ETL Consumer already started, please stop it before run!')
            return
        else:
            self.redis.set_status('consumer', 'run')
            self.es.create_index(self.index_name, esbody)
            # Build the sink coroutine, then feed it from the Redis poller.
            putter = self.put_films_to_ES()
            self.get_filmsid_from_redis(putter)

    def stop(self):
        """Signal the polling loop in get_filmsid_from_redis() to exit."""
        self.redis.set_status('consumer', 'stop')
        logger.info('consumer stopped')
class ETLConsumer:
    """Consume queued ids from Redis and load films, genres and persons
    into their respective Elasticsearch indexes."""

    def __init__(self):
        cnf = ETLSettings()
        # Table descriptors shared with the producer side; only entries with
        # isESindex set are driven through the genre/person pipeline.
        self.producer_table: List[ETLProducerTable] = postgres_table
        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit
        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()

    def worker(self, getfilmsidfromredis, getdatafromtable):
        """Main loop: while the 'consumer' status is 'run', drive one film
        pass and one pass per ES-indexed side table, then re-check status.
        Returns nothing (the original '-> ETLProducerTable' annotation was
        wrong)."""
        while self.redis.get_status('consumer') == 'run':
            getfilmsidfromredis.send(None)
            for table in self.producer_table:
                if table.isESindex:
                    getdatafromtable.send(table)
        if self.redis.get_status('consumer') == 'stop':
            logger.info('consumer stopped by stop signal')

    @coroutine
    def get_data_from_table(self, datatoes):
        """Coroutine: receive an ETLProducerTable, pull queued ids for that
        table from Redis, fetch the rows from Postgres and forward them.

        Bug fix: the original sent ``datas`` downstream even when no ids
        were queued (or the table had no loader), so put_data_to_ES crashed
        on the missing 'data' key. Rows are now sent only when fetched.
        """
        while True:
            data: ETLProducerTable = (yield)
            idlists = self.redis.get_tableid_for_work(self.limit, data.table)
            if len(idlists) == 0:
                continue  # nothing queued for this table on this pass
            logger.info(f'Get {data.table} id to load to ES')
            datas = {'table': data}
            if data.table == 'djfilmgenre':
                datas['data'] = self.pgbase.get_genrebyid(tuple(idlists))
            elif data.table == 'djfilmperson':
                datas['data'] = self.pgbase.get_personbyid(tuple(idlists))
            else:
                continue  # no loader for this table; skip instead of crashing
            datatoes.send(datas)

    @coroutine
    def put_data_to_ES(self):
        """Coroutine: receive {'table': ETLProducerTable, 'data': rows} and
        bulk-load the rows into the index configured in ES_INDEXES."""
        while True:
            datas = (yield)
            logger.info(
                f'Start loading to ES index from {datas["table"].table}')
            # Start from an empty list every pass so an unmatched table can
            # neither re-send stale rows from the previous iteration nor
            # raise NameError (both were possible before).
            esdata = []
            rows = datas.get('data', [])
            if datas['table'].table == 'djfilmgenre':
                esdata = [
                    ESGenres(row.id, row.name, row.description)
                    for row in rows
                ]
            elif datas['table'].table == 'djfilmperson':
                esdata = [
                    ESPersons(row.id, row.full_name, row.imdb_nconst,
                              row.birth_date, row.death_date, row.role,
                              row.filmids, row.directorsfilmids,
                              row.actorsfilmids, row.writersfilmids)
                    for row in rows
                ]
            if self.es.bulk_update(
                    ES_INDEXES[datas['table'].ESindexconf]['name'], esdata):
                self.redis.del_work_queuename(datas['table'].table)
                logger.info(
                    f'Data succesfully loaded from {datas["table"].table}, working queue'
                )
            else:
                # Load failed: back off before the next attempt.
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    @coroutine
    def get_filmsid_from_redis(self, putfilmtoes):
        """Coroutine: on each wake-up pull queued film ids from Redis, fetch
        the matching films from Postgres and forward them to the loader.
        (The original '-> List[ETLFilmWork]' annotation was wrong; the
        values travel via putfilmtoes.send, nothing is returned.)"""
        while True:
            data = (yield)  # value ignored; worker sends None as a tick
            logger.info('Get film id to load to ES')
            idlists = self.redis.get_filmid_for_work(self.limit)
            films = self.pgbase.get_filmsbyid(
                tuple(idlists)) if len(idlists) > 0 else []
            putfilmtoes.send(films)

    @coroutine
    def put_films_to_ES(self):
        """Coroutine: receive lists of ETLFilmWork rows and bulk-load them
        into the film index, clearing the Redis working queue on success."""
        while True:
            films: List[ETLFilmWork] = (yield)
            logger.info('Start loading film data to elastic')
            # Person fields arrive as 'id : name' strings; split them into
            # plain-name lists and ESPerson objects for the document.
            esfilms = [
                ESMovie(
                    film.id, film.rating, film.imdb_tconst, film.type_name,
                    film.genres, film.title, film.description,
                    [name.split(' : ')[1] for name in film.directors]
                    if film.directors else None,
                    [name.split(' : ')[1] for name in film.actors]
                    if film.actors else None,
                    [name.split(' : ')[1] for name in film.writers]
                    if film.writers else None,
                    [ESPerson(*name.split(' : ')) for name in film.directors]
                    if film.directors else None,
                    [ESPerson(*name.split(' : ')) for name in film.actors]
                    if film.actors else None,
                    [ESPerson(*name.split(' : ')) for name in film.writers]
                    if film.writers else None) for film in films
            ]
            if self.es.bulk_update(self.index_name, esfilms):
                self.redis.del_work_queuename()
                logger.info(
                    'Film data succesfully loaded, delete working queue')
            else:
                # Load failed: back off before the next attempt.
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    def start(self):
        """Mark the consumer running, ensure all three ES indexes exist,
        assemble the coroutine pipeline and enter the blocking worker loop."""
        if self.redis.get_status('consumer') == 'run':
            logger.warning(
                'ETL Consumer already started, please stop it before run!')
            return
        self.redis.set_status('consumer', 'run')
        self.es.create_index(self.index_name, esbody)
        self.es.create_index(ES_INDEXES['GENRE_INDEX']['name'],
                             ES_INDEXES['GENRE_INDEX']['body_json'])
        self.es.create_index(ES_INDEXES['PERSON_INDEX']['name'],
                             ES_INDEXES['PERSON_INDEX']['body_json'])
        # level 2: sinks
        putfilmtoes = self.put_films_to_ES()
        datatoes = self.put_data_to_ES()
        # level 1: fetchers feeding the sinks
        getfilmsidfromredis = self.get_filmsid_from_redis(putfilmtoes)
        getdatafromtable = self.get_data_from_table(datatoes)
        # level 0: driver
        self.worker(getfilmsidfromredis, getdatafromtable)

    def stop(self):
        """Ask the worker loop to exit after its current pass."""
        self.redis.set_status('consumer', 'stop')
        logger.info('consumer will be stopped')
class ETLProducer:
    """Watch the Django source tables for changes and queue the affected
    film ids in Redis for the consumer side."""

    # Tables to watch. djfilmwork is the main table (isrelation=False); the
    # others map their changed rows to films through a join (p)table.
    producer_table = [
        ETLProducerTable(table='djfilmwork', isrelation=False),
        ETLProducerTable(table='djfilmperson', field='film_work_id',
                         ptable='djfilmworkperson', pfield='person_id'),
        ETLProducerTable(table='djfilmgenre', field='film_work_id',
                         ptable='djfilmworkgenre', pfield='genre_id'),
        ETLProducerTable(table='djfilmtype', field='id',
                         ptable='djfilmwork', pfield='type_id'),
    ]

    def __init__(self):
        cnf = ETLSettings()
        self.limit = cnf.etl_size_limit
        self.redis = ETLRedis()
        self.pgbase = ETLPG()

    def worker(self, producer):
        """
        Get List of ETLProducerTable and start etl process from django
        to redis, for each table.
        """
        while self.redis.get_status('producer') == 'run':
            for table in self.producer_table:
                logger.info(f'start processing : {table}')
                producer.send(table)

    @coroutine
    def producer(self, enricher):
        """
        This coroutine get modifed data from producer table, and send it to
        enricher. The state is stored in Redis. If no state in Redis, get
        all data from producer table.
        """
        while True:
            data: ETLProducerTable = (yield)
            # Resume from the stored checkpoint, or from the oldest row when
            # no checkpoint exists yet.
            lasttime = self.redis.get_lasttime(
                data.table) or self.pgbase.get_first_object_time(data.table)
            idlist = self.pgbase.get_updated_object_id(lasttime, data.table,
                                                       self.limit)
            logger.info(
                f'get new or modifed data from postgress "{data.table}" table')
            try:
                # Advance the checkpoint to the newest row of this batch.
                # (The original bound the unused return value to lasttime.)
                self.redis.set_lasttime(data.table, idlist[-1].modified)
            except IndexError:
                # Empty batch: keep the checkpoint and back off a little.
                logger.warning(f'No more new data in {data.table}')
                some_sleep(min_sleep_time=1, max_sleep_time=10)
            enricher.send(ETLEnricherData(data, [row.id for row in idlist]))

    @coroutine
    def enricher(self):
        """
        Get modified film id from main table.
        If table is main, simple get modifed film id.
        """
        while True:
            data: ETLEnricherData = (yield)
            logger.info(
                f'get film id modifed by {data.table.table} and store it in Redis'
            )
            offset = 0
            # Idiom fixes: boolean expression instead of `True if ... else
            # False`; plain for-loop instead of a side-effect-only list
            # comprehension; no shadowing of the `id` builtin.
            pending = len(data.idlist) > 0
            while pending:
                if data.table.isrelation:
                    # Related table: resolve changed rows to film ids, paged.
                    filmids = self.pgbase.get_updated_film_id(
                        data.table, tuple(data.idlist), self.limit, offset)
                else:
                    # Main table: the ids already are film ids.
                    filmids = data.idlist
                for filmid in filmids:
                    self.redis.push_filmid(filmid)
                if len(filmids) == self.limit and data.table.isrelation:
                    offset += self.limit  # full page — there may be more rows
                else:
                    pending = False

    def start(self):
        """Mark the producer running, build the coroutine chain and enter
        the blocking worker loop."""
        if self.redis.get_status('producer') == 'run':
            logger.warning(
                'ETL Producer already started, please stop it before run!')
            return
        self.redis.set_status('producer', 'run')
        enricher = self.enricher()
        producer = self.producer(enricher)
        self.worker(producer)

    def stop(self):
        """Ask the worker loop to exit after its current pass."""
        self.redis.set_status('producer', 'stop')
        logger.info('producer stopped')
def __init__(self):
    """Read the ETL settings and attach the Redis and Postgres helpers."""
    settings = ETLSettings()
    # Batch size comes from configuration.
    self.limit = settings.etl_size_limit
    # Backing services: queue and source database.
    self.redis = ETLRedis()
    self.pgbase = ETLPG()