Example #1
    def __init__(self):
        cnf = ETLSettings()

        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit

        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()
Example #2
    def __init__(self):
        cnf = ETLSettings()

        self.producer_table: ETLProducerTable = postgres_table

        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit

        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()
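
The ETLSettings object referenced in these constructors is not shown in any of the snippets. Below is a minimal sketch of its likely shape, assuming a pydantic BaseSettings class exposing the two fields the examples read (elastic_index and etl_size_limit); the defaults are illustrative assumptions, not the project's actual configuration.

# Hypothetical sketch of ETLSettings, assuming pydantic's BaseSettings (v1 API).
# Field names come from the snippets above; default values are illustrative only.
from pydantic import BaseSettings


class ETLSettings(BaseSettings):
    elastic_index: str = 'movies'   # target Elasticsearch index name
    etl_size_limit: int = 100       # batch size used by producer and consumer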
Example #3
class ETLConsumer:

    def __init__(self):
        cnf = ETLSettings()

        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit

        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()

    def get_filmsid_from_redis(self, putter) -> None:
        while self.redis.get_status('consumer') == 'run':
            idlists = self.redis.get_filmid_for_work(self.limit)
            films = self.pgbase.get_filmsbyid(tuple(idlists)) if len(idlists) > 0 else []
            putter.send(films)

    @coroutine
    def put_films_to_ES(self):
        while True:
            films: List[ETLFilmWork] = (yield)
            logger.info('Start loading data to elastic')
            esfilms = [
                ESMovie(
                    film.id, film.rating, film.imdb_tconst, film.type_name, film.genres,
                    film.title, film.description,
                    [name.split(' : ')[1] for name in film.directors] if film.directors else None,
                    [name.split(' : ')[1] for name in film.actors] if film.actors else None,
                    [name.split(' : ')[1] for name in film.writers] if film.writers else None,
                    [ESPerson(*name.split(' : ')) for name in film.directors] if film.directors else None,
                    [ESPerson(*name.split(' : ')) for name in film.actors] if film.actors else None,
                    [ESPerson(*name.split(' : ')) for name in film.writers] if film.writers else None
                ) for film in films]
            if self.es.bulk_update(esfilms):
                self.redis.del_work_queuename()
                logger.info('Data successfully loaded, delete working queue')
            else:
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    def start(self):
        if self.redis.get_status('consumer') == 'run':
            logger.warning('ETL Consumer already started, please stop it before running!')
            return
        else:
            self.redis.set_status('consumer', 'run')
            self.es.create_index(self.index_name, esbody)

        putter = self.put_films_to_ES()
        self.get_filmsid_from_redis(putter)

    def stop(self):
        self.redis.set_status('consumer', 'stop')
        logger.info('consumer stopped')
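
Every pipeline stage in these examples is wrapped in a @coroutine decorator that the snippets do not include. Below is a minimal sketch of what such a decorator usually looks like, assuming it only primes the generator so callers can .send() into it immediately.

# Assumed priming decorator for the generator-based pipeline stages.
from functools import wraps


def coroutine(func):
    @wraps(func)
    def primer(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)        # advance the generator to its first (yield)
        return gen
    return primer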
Example #4
class ETLConsumer:
    def __init__(self):
        cnf = ETLSettings()

        self.producer_table: ETLProducerTable = postgres_table

        self.index_name = cnf.elastic_index
        self.limit = cnf.etl_size_limit

        self.redis = ETLRedis()
        self.pgbase = ETLPG()
        self.es = ETLElastic()

    def worker(self, getfilmsidfromredis, getdatafromtable) -> None:
        while self.redis.get_status('consumer') == 'run':
            getfilmsidfromredis.send(None)
            for table in self.producer_table:
                if table.isESindex:
                    getdatafromtable.send(table)

        if self.redis.get_status('consumer') == 'stop':
            logger.info('consumer stopped by stop signal')

    @coroutine
    def get_data_from_table(self, datatoes):
        while True:
            data: ETLProducerTable = (yield)
            datas = {}
            idlists = self.redis.get_tableid_for_work(self.limit, data.table)
            datas['table'] = data
            if idlists:
                logger.info(f'Get {data.table} id to load to ES')
                if data.table == 'djfilmgenre':
                    datas['data'] = self.pgbase.get_genrebyid(tuple(idlists))
                elif data.table == 'djfilmperson':
                    datas['data'] = self.pgbase.get_personbyid(tuple(idlists))
                datatoes.send(datas)

    @coroutine
    def put_data_to_ES(self):
        while True:
            datas = (yield)
            logger.info(
                f'Start loading to ES index from {datas["table"].table}')
            if datas['table'].table == 'djfilmgenre':
                esdata = [
                    ESGenres(row.id, row.name, row.description)
                    for row in datas['data']
                ]
            elif datas['table'].table == 'djfilmperson':
                esdata = [
                    ESPersons(row.id, row.full_name, row.imdb_nconst,
                              row.birth_date, row.death_date, row.role,
                              row.filmids, row.directorsfilmids,
                              row.actorsfilmids, row.writersfilmids)
                    for row in datas['data']
                ]
            if self.es.bulk_update(
                    ES_INDEXES[datas['table'].ESindexconf]['name'], esdata):
                self.redis.del_work_queuename(datas['table'].table)
                logger.info(
                    f'Data successfully loaded from {datas["table"].table}, delete working queue'
                )
            else:
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    @coroutine
    def get_filmsid_from_redis(self, putfilmtoes):
        while True:
            data = (yield)
            logger.info('Get film id to load to ES')
            idlists = self.redis.get_filmid_for_work(self.limit)
            films = self.pgbase.get_filmsbyid(
                tuple(idlists)) if len(idlists) > 0 else []
            putfilmtoes.send(films)

    @coroutine
    def put_films_to_ES(self):
        while True:
            films: List[ETLFilmWork] = (yield)
            logger.info('Start loading film data to elastic')
            esfilms = [
                ESMovie(
                    film.id, film.rating, film.imdb_tconst, film.type_name,
                    film.genres, film.title, film.description,
                    [name.split(' : ')[1]
                     for name in film.directors] if film.directors else None,
                    [name.split(' : ')[1]
                     for name in film.actors] if film.actors else None,
                    [name.split(' : ')[1]
                     for name in film.writers] if film.writers else None,
                    [ESPerson(*name.split(' : '))
                     for name in film.directors] if film.directors else None,
                    [ESPerson(*name.split(' : '))
                     for name in film.actors] if film.actors else None,
                    [ESPerson(*name.split(' : '))
                     for name in film.writers] if film.writers else None)
                for film in films
            ]
            if self.es.bulk_update(self.index_name, esfilms):
                self.redis.del_work_queuename()
                logger.info(
                    'Film data successfully loaded, delete working queue')
            else:
                some_sleep(min_sleep_time=1, max_sleep_time=10)

    def start(self):
        if self.redis.get_status('consumer') == 'run':
            logger.warning(
                'ETL Consumer already started, please stop it before running!')
            return
        else:
            self.redis.set_status('consumer', 'run')
            self.es.create_index(self.index_name, esbody)
            self.es.create_index(ES_INDEXES['GENRE_INDEX']['name'],
                                 ES_INDEXES['GENRE_INDEX']['body_json'])
            self.es.create_index(ES_INDEXES['PERSON_INDEX']['name'],
                                 ES_INDEXES['PERSON_INDEX']['body_json'])

        # level 2
        putfilmtoes = self.put_films_to_ES()
        datatoes = self.put_data_to_ES()
        # level 1
        getfilmsidfromredis = self.get_filmsid_from_redis(putfilmtoes)
        getdatafromtable = self.get_data_from_table(datatoes)
        # level 0
        self.worker(getfilmsidfromredis, getdatafromtable)

    def stop(self):
        self.redis.set_status('consumer', 'stop')
        logger.info('consumer will be stopped')
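
A hypothetical driver for this consumer, showing how start() blocks in the worker loop and how a stop signal ends it; the __main__ guard and the KeyboardInterrupt handling are illustrative and not part of the original code.

# Illustrative usage only: run the consumer until the Redis status flag flips.
if __name__ == '__main__':
    consumer = ETLConsumer()
    try:
        consumer.start()      # blocks in worker() while status == 'run'
    except KeyboardInterrupt:
        consumer.stop()       # sets status to 'stop'; worker() exits its loop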
Example #5
class ETLProducer:
    producer_table = [
        ETLProducerTable(table='djfilmwork', isrelation=False),
        ETLProducerTable(table='djfilmperson',
                         field='film_work_id',
                         ptable='djfilmworkperson',
                         pfield='person_id'),
        ETLProducerTable(table='djfilmgenre',
                         field='film_work_id',
                         ptable='djfilmworkgenre',
                         pfield='genre_id'),
        ETLProducerTable(table='djfilmtype',
                         field='id',
                         ptable='djfilmwork',
                         pfield='type_id'),
    ]

    def __init__(self):
        cnf = ETLSettings()
        self.limit = cnf.etl_size_limit
        self.redis = ETLRedis()
        self.pgbase = ETLPG()

    def worker(self, producer):
        """
        Get List of ETLProducerTable and start etl process from django to redis, for each table.
        """
        while self.redis.get_status('producer') == 'run':
            for table in self.producer_table:
                logger.info(f'start processing : {table}')
                producer.send(table)

    @coroutine
    def producer(self, enricher):
        """
        This coroutine get modifed data from producer table, and send it to enricher.
        The state is stored in Redis.
        If no state in Redis, get all data from producer table.
        """
        while True:
            data: ETLProducerTable = (yield)
            lasttime = self.redis.get_lasttime(
                data.table) or self.pgbase.get_first_object_time(data.table)
            idlist = self.pgbase.get_updated_object_id(lasttime, data.table,
                                                       self.limit)
            logger.info(
                f'get new or modified data from postgres "{data.table}" table')
            try:
                lasttime = self.redis.set_lasttime(data.table,
                                                   idlist[-1].modified)
            except IndexError:
                logger.warning(f'No more new data in {data.table}')
                some_sleep(min_sleep_time=1, max_sleep_time=10)
            idlist = [filmid.id for filmid in idlist]
            enricher.send(ETLEnricherData(data, idlist))

    @coroutine
    def enricher(self):
        """
        Get modified film id from main table.
        If table is main, simple get modifed film id.
        """
        while True:
            data: ETLEnricherData = (yield)
            logger.info(
                f'get film id modified by {data.table.table} and store it in Redis'
            )
            offset = 0
            isupdatedid = len(data.idlist) > 0
            while isupdatedid:
                filmids = (self.pgbase.get_updated_film_id(
                    data.table, tuple(data.idlist), self.limit, offset)
                           if data.table.isrelation else data.idlist)
                for filmid in filmids:
                    self.redis.push_filmid(filmid)
                if (len(filmids) == self.limit) and (data.table.isrelation):
                    offset += self.limit
                else:
                    isupdatedid = False

    def start(self):
        if self.redis.get_status('producer') == 'run':
            logger.warning(
                'ETL Producer already started, please stop it before running!')
            return
        else:
            self.redis.set_status('producer', 'run')

        enricher = self.enricher()
        producer = self.producer(enricher)
        self.worker(producer)

    def stop(self):
        self.redis.set_status('producer', 'stop')
        logger.info('producer stopped')
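
ETLProducerTable itself is not shown in any of the examples. Below is a sketch of its probable shape, reconstructed only from the attributes the snippets access (table, field, ptable, pfield, isrelation, isESindex, ESindexconf); the dataclass form, defaults, and field comments are assumptions.

# Hypothetical reconstruction of ETLProducerTable from the attributes used above.
from dataclasses import dataclass
from typing import Optional


@dataclass
class ETLProducerTable:
    table: str                          # source table in the Django schema
    field: Optional[str] = None         # join column used to resolve affected film works
    ptable: Optional[str] = None        # parent/relation table joined against
    pfield: Optional[str] = None        # column in ptable that references this table
    isrelation: bool = True             # False only for the main djfilmwork table
    isESindex: bool = False             # True if the table feeds its own ES index
    ESindexconf: Optional[str] = None   # key into ES_INDEXES, e.g. 'GENRE_INDEX'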
Example #6
    def __init__(self):
        cnf = ETLSettings()
        self.limit = cnf.etl_size_limit
        self.redis = ETLRedis()
        self.pgbase = ETLPG()