def task_delete_selected_collections(selected_uuids):
    """
    Task to delete Loaded Collections.

    @param:
    - selected_uuids: list of UUIDs of the documents to be removed

    If `selected_uuids` is longer than SLICE_SIZE, it is sliced into
    sub-lists of SLICE_SIZE and one task is re-enqueued per slice.
    If `selected_uuids` is shorter than SLICE_SIZE, the documents are
    deleted directly from the queryset (LoadCollection and OPAC).
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000

    if len(selected_uuids) > SLICE_SIZE:
        # too many uuids for one task: fan out one task per slice
        for uuid_slice in chunks(selected_uuids, SLICE_SIZE):
            slice_as_strings = [str(uuid) for uuid in uuid_slice]
            r_queues.enqueue(
                'load', 'collection',
                task_delete_selected_collections, slice_as_strings)
    else:
        # remove the LoadCollection documents referenced by the uuids
        LoadCollection.objects.filter(uuid__in=selected_uuids).delete()

        # convert the uuids to _id and delete the matching OPAC records
        register_connections()
        opac_pks = [str(uuid).replace('-', '') for uuid in selected_uuids]
        with switch_db(opac_models.Collection, OPAC_WEBAPP_DB_NAME) as opac_model:
            opac_model.objects.filter(pk__in=opac_pks).delete()
def task_delete_selected_collections_identifiers(selected_ids):
    """
    Task to delete Collection identifiers.

    @param:
    - selected_ids: list of pks of the documents to be removed

    If `selected_ids` is longer than SLICE_SIZE, it is sliced into
    sub-lists of SLICE_SIZE and re-enqueued; otherwise the documents
    are deleted directly from the queryset.
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000
    if len(selected_ids) <= SLICE_SIZE:
        # small enough: delete straight from the queryset
        CollectionIdModel.objects.filter(pk__in=selected_ids).delete()
    else:
        for id_slice in chunks(selected_ids, SLICE_SIZE):
            slice_as_strings = [str(pk) for pk in id_slice]
            r_queues.enqueue(
                'sync_ids', 'collection',
                task_delete_selected_collections_identifiers,
                slice_as_strings)
def task_delete_selected_collections(selected_uuids):
    """
    Task to delete Transformed Collections.

    @param:
    - selected_uuids: list of UUIDs of the documents to be removed

    If `selected_uuids` is longer than SLICE_SIZE, it is sliced into
    sub-lists of SLICE_SIZE and one task is re-enqueued per slice.
    If `selected_uuids` is shorter than SLICE_SIZE, the documents are
    deleted directly from the queryset.
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000
    if len(selected_uuids) <= SLICE_SIZE:
        # delete directly from the TransformCollection queryset
        TransformCollection.objects.filter(uuid__in=selected_uuids).delete()
    else:
        # fan out: re-enqueue one deletion task per slice
        for uuid_slice in chunks(selected_uuids, SLICE_SIZE):
            r_queues.enqueue(
                'transform', 'collection',
                task_delete_selected_collections,
                [str(uuid) for uuid in uuid_slice])
def task_delete_selected_diff_etl_model(stage, model_name, action, selected_uuids):
    """
    Task to delete diff records of an ETL model.

    @param stage: ETL stage (must be in ETL_STAGE_LIST)
    @param model_name: ETL model name (must be in ETL_MODEL_NAME_LIST)
    @param action: diff action (must be in ACTION_LIST)
    @param selected_uuids: list of UUIDs of the documents to be removed

    Lists longer than SLICE_SIZE are sliced and re-enqueued; shorter
    lists are deleted directly from the queryset.
    """
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)
    if action not in ACTION_LIST:
        raise ValueError(u'param action: %s é inválido' % action)

    get_db_connection()
    diff_model = DIFF_MODEL_CLASS_BY_NAME[model_name]
    r_queues = RQueues()
    SLICE_SIZE = 1000

    if len(selected_uuids) <= SLICE_SIZE:
        diff_model.objects.filter(uuid__in=selected_uuids).delete()
        return

    for uuid_slice in chunks(selected_uuids, SLICE_SIZE):
        r_queues.enqueue(
            'sync_ids', model_name,
            task_delete_selected_diff_etl_model,
            stage, model_name, action,
            [str(uuid) for uuid in uuid_slice])  # task args
def task_delete_selected_diff_etl_model(stage, model_name, action, selected_uuids):
    """
    Task to delete diff records of an ETL model.

    @param stage: ETL stage (must be in ETL_STAGE_LIST)
    @param model_name: ETL model name (must be in ETL_MODEL_NAME_LIST)
    @param action: diff action (must be in ACTION_LIST)
    @param selected_uuids: list of UUIDs of the documents to be removed

    Lists longer than SLICE_SIZE are sliced and re-enqueued; shorter
    lists are deleted directly from the queryset.
    """
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)
    if action not in ACTION_LIST:
        raise ValueError(u'param action: %s é inválido' % action)

    get_db_connection()
    target_model = DIFF_MODEL_CLASS_BY_NAME[model_name]
    queues = RQueues()
    SLICE_SIZE = 1000

    if len(selected_uuids) > SLICE_SIZE:
        # fan out: one task per slice of uuids
        for sub_list in chunks(selected_uuids, SLICE_SIZE):
            stringified = [str(uuid) for uuid in sub_list]
            queues.enqueue('sync_ids', model_name,
                           task_delete_selected_diff_etl_model,
                           stage, model_name, action, stringified)  # task args
    else:
        target_model.objects.filter(uuid__in=selected_uuids).delete()
def task_delete_selected_collections_identifiers(selected_ids):
    """
    Task to delete Collection identifiers.

    @param:
    - selected_ids: list of pks of the documents to be removed

    If `selected_ids` is longer than SLICE_SIZE, it is sliced into
    sub-lists of SLICE_SIZE and re-enqueued; otherwise the documents
    are deleted directly from the queryset.
    """
    get_db_connection()
    queues = RQueues()
    SLICE_SIZE = 1000
    if len(selected_ids) > SLICE_SIZE:
        # fan out: re-enqueue one task per slice of pks
        for pk_slice in chunks(selected_ids, SLICE_SIZE):
            queues.enqueue(
                'sync_ids', 'collection',
                task_delete_selected_collections_identifiers,
                [str(pk) for pk in pk_slice])
    else:
        CollectionIdModel.objects.filter(pk__in=selected_ids).delete()
def task_consume_diff_update(stage, model_name):
    """
    Task that consumes the UPDATE diff records filtered by the params:
    - @param stage: ETL stage
    - @param model_name: name of the ETL model
    """
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)

    action = 'update'
    SLICE_SIZE = 1000
    r_queues = RQueues()
    get_db_connection()

    # fetch the unapplied diff uuids from the model's differ
    differ = ETL_DIFFERS_BY_MODEL[model_name]()
    pending_uuids = differ.get_uuids_unapplied(stage, action)

    # enqueue one consumer task per slice of uuids
    for uuid_slice in chunks(pending_uuids, SLICE_SIZE):
        flat_uuids = [str(uuid) for uuid in uuid_slice]
        logger.info(u'enfilerando: consumo de UUUIDs selecionados (stage:%s, model: %s, action: %s)' % (stage, model_name, action))
        r_queues.enqueue(
            'sync_ids', model_name,
            task_differ_apply_for_selected_uuids,
            stage, model_name, action, flat_uuids)
def task_retrive_all_articles_ids():
    """Enqueue retrieval of every article identifier, in chunks of 1000."""
    retriever = RETRIEVERS_BY_MODEL['article']()
    r_queues = RQueues()
    all_ids = list(retriever.get_data_source_identifiers())
    for ids_chunk in chunks(all_ids, 1000):
        r_queues.enqueue('sync_ids', 'article',
                         task_retrive_articles_ids_by_chunks, ids_chunk)
def task_retrive_all_articles_ids():
    """Enqueue retrieval of every article identifier, in chunks of 1000."""
    retriever_class = RETRIEVERS_BY_MODEL['article']
    retriever = retriever_class()
    queues = RQueues()
    identifiers = [identifier
                   for identifier in retriever.get_data_source_identifiers()]
    for ids_chunk in chunks(identifiers, 1000):
        queues.enqueue(
            'sync_ids', 'article',
            task_retrive_articles_ids_by_chunks, ids_chunk)
def task_retrieve_all_news_identifiers():
    """
    Task to run Extraction of ALL records of the model: NewsIdModel
    """
    r_queues = RQueues()
    retriever = NewsIdDataRetriever()
    all_ids = list(retriever.get_data_source_identifiers())
    # enqueue one retrieval task per chunk of 1000 identifiers
    for ids_chunk in chunks(all_ids, 1000):
        r_queues.enqueue('sync_ids', 'news',
                         task_retrieve_selected_news_identifiers, ids_chunk)
def task_retrieve_all_collections_identifiers():
    """
    Task to run Extraction of ALL records of the model: CollectionIdModel
    """
    queues = RQueues()
    retriever = CollectionIdDataRetriever()
    identifiers = [identifier
                   for identifier in retriever.get_data_source_identifiers()]
    # enqueue one retrieval task per chunk of 1000 identifiers
    for ids_chunk in chunks(identifiers, 1000):
        queues.enqueue(
            'sync_ids', 'collection',
            task_retrieve_selected_collections_identifiers, ids_chunk)
def enqueue_differ_consumer_tasks(stage='all', model_name='all', action='all'):
    """
    Enqueue `task_differ_apply_for_selected_uuids` tasks to consume
    unapplied diff records.

    @param stage: ETL stage name, or 'all' (default) for every stage
    @param model_name: ETL model name, or 'all' (default) for every model
    @param action: diff action, or 'all' (default) for every action

    @raises ValueError: when `stage` is neither 'all' nor in STAGE_LIST

    For each (stage, model, action) combination the pending uuids are
    fetched from the model's differ, split into chunks of 1000, and one
    consumer task is enqueued per chunk; a sync-event record is created
    for each enqueued chunk.
    """
    logger.info(u"iniciando task_differ_apply_for_stage (stage: %s, model_name: %s, action: %s)" % (
        stage, model_name, action))
    diff_class = None
    r_queues = RQueues()
    get_db_connection()
    if stage == 'all':
        stages_list = STAGE_LIST
    elif stage not in STAGE_LIST:
        raise ValueError('Param: stage: %s com valor inesperado!' % stage)
    else:
        stages_list = [stage, ]
    # NOTE(review): model_name and action are not validated here (only
    # stage is) — an invalid model_name surfaces later as a KeyError on
    # DIFFERS[model_]; confirm whether validation should be added.
    if model_name == 'all':
        models_list = MODEL_NAME_LIST
    else:
        models_list = [model_name]
    if action == 'all':
        actions_list = ACTION_LIST
    else:
        actions_list = [action, ]
    for stage_ in stages_list:
        for model_ in models_list:
            for action_ in actions_list:
                diff_class = DIFFERS[model_]
                diff_class_instance = diff_class()
                # BUGFIX: query by the loop variables (stage_, action_),
                # not the raw params, which may be the literal 'all'.
                full_uuids_to_process = diff_class_instance.get_uuids_unapplied(stage_, action_)
                list_of_list_of_uuids = list(chunks(full_uuids_to_process, 1000))
                for list_of_uuids in list_of_list_of_uuids:
                    list_of_uuids_flat = [str(uuid) for uuid in list_of_uuids]
                    logger.info(u'enfilerando: task_differ_apply_for_selected_uuids(stage:%s, model: %s, action: %s)' % (stage_, model_, action_))
                    r_queues.enqueue(
                        'sync_ids', model_,
                        task_differ_apply_for_selected_uuids,
                        stage_, model_, action_, list_of_uuids_flat
                    )
                    create_sync_event_record(
                        'sync_ids', model_, 'task_differ_apply_for_selected_uuids',
                        u'Enfilerando task para consumir registros diff: stage %s model: %s: action: %s, quantidade de UUIDS: %s' % (
                            stage_, model_, action_, len(list_of_uuids_flat))
                    )
def task_extract_all_articles(uuids=None):
    """
    Task to run Extraction of ALL records of the model: Article

    NOTE(review): the `uuids` parameter is currently ignored — the uuids
    are always read from ArticleIdModel; confirm the intended behavior.
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000
    all_uuids = identifiers_models.ArticleIdModel.objects.all().values_list('uuid')
    if len(all_uuids) <= SLICE_SIZE:
        # single batch: enqueue everything at once
        r_queues.enqueue('extract', 'article', task_extract_selected_articles,
                         [str(uuid) for uuid in all_uuids])
    else:
        # fan out: one extraction task per slice of uuids
        for uuid_slice in chunks(all_uuids, SLICE_SIZE):
            r_queues.enqueue('extract', 'article', task_extract_selected_articles,
                             [str(uuid) for uuid in uuid_slice])
def task_transform_all_collections():
    """
    Task to run Transformation of ALL records of the model: Collection
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000
    all_uuids = identifiers_models.CollectionIdModel.objects.all().values_list('uuid')
    if len(all_uuids) <= SLICE_SIZE:
        # single batch: enqueue everything at once
        r_queues.enqueue('transform', 'collection',
                         task_transform_selected_collections,
                         [str(uuid) for uuid in all_uuids])
    else:
        # fan out: one transformation task per slice of uuids
        for uuid_slice in chunks(all_uuids, SLICE_SIZE):
            r_queues.enqueue('transform', 'collection',
                             task_transform_selected_collections,
                             [str(uuid) for uuid in uuid_slice])
def task_load_all_news():
    """
    Task to run Load of ALL records of the model: News
    """
    get_db_connection()
    r_queues = RQueues()
    SLICE_SIZE = 1000
    all_uuids = identifiers_models.NewsIdModel.objects.all().values_list('uuid')
    if len(all_uuids) <= SLICE_SIZE:
        # single batch: enqueue everything at once
        r_queues.enqueue('load', 'news', task_load_selected_news,
                         [str(uuid) for uuid in all_uuids])
    else:
        # fan out: one load task per slice of uuids
        for uuid_slice in chunks(all_uuids, SLICE_SIZE):
            r_queues.enqueue('load', 'news', task_load_selected_news,
                             [str(uuid) for uuid in uuid_slice])
def do_reprocess_xml_only(self):
    """
    Reprocess every record whose data_model_version is 'xml'.

    The uuids are processed in slices of at most 1000; success or
    failure is reported through self._trigger_messages.
    """
    processor = self.process_class()
    xml_uuids = self.model_class.objects.filter(
        data_model_version='xml').values_list('uuid')
    SLICE_SIZE = 1000
    count_xml_articles = len(xml_uuids)
    try:
        if count_xml_articles <= SLICE_SIZE:
            # small enough: process everything in one call
            processor.selected([str(uuid) for uuid in xml_uuids])
        else:
            for uuid_slice in chunks(xml_uuids, SLICE_SIZE):
                processor.selected([str(uuid) for uuid in uuid_slice])
    except Exception as e:
        traceback_str = traceback.format_exc()
        self._trigger_messages(
            is_error=True, exception_obj=e, traceback_str=traceback_str,
            items_count=count_xml_articles)
    else:
        self._trigger_messages(items_count=count_xml_articles)
def generic_task_enqueue_from_uuid_iterable(stage, model_name, model_class, r_queues, target_fn, ids):
    """
    Enqueue `target_fn` once per uuid of `model_class`.

    @param stage: ETL stage used for the queue and sync-event records
    @param model_name: name of the ETL model
    @param model_class: model class whose uuids are enqueued
    @param r_queues: RQueues instance used to enqueue the tasks
    @param target_fn: task function to enqueue
    @param ids: iterable of pks to process, or None to process all records

    When `ids` is None, every uuid of `model_class` is enqueued (for
    'article', only records missing at least one execution date).
    Otherwise each pk is resolved to its uuid and enqueued; pks whose
    record does not exist are logged and skipped.
    """
    if ids is None:
        # update all collections
        if model_name == 'article':
            # For articles we only take records missing at least one of
            # the dates: extract OR transform OR load
            model_instances_uuid_qs = model_class.objects.filter(
                Q(extract_execution_date=None) |
                Q(transform_execution_date=None) |
                Q(load_execution_date=None)
            ).values_list('uuid')
        else:
            model_instances_uuid_qs = model_class.objects.values_list('uuid')
        list_of_list_of_uuids = list(chunks(model_instances_uuid_qs, 1000))
        logger.debug(u"obtive: %s listas de listas de UUIDs" % len(list_of_list_of_uuids))
        for list_of_uuids in list_of_list_of_uuids:
            logger.info("enfilerando este chunk de UUIDs: %s" % list_of_uuids)
            create_sync_event_record(
                stage, model_name, target_fn,
                u'Iniciando enfileramento de todos os registros (%s) do modelo: %s' % (len(list_of_uuids), model_name))
            # BUGFIX: the previous `map(lambda uuid: r_queues.enqueue(...))`
            # is lazy on Python 3, so nothing was ever enqueued; use an
            # explicit loop (also equivalent, and clearer, on Python 2).
            for uuid in list_of_uuids:
                r_queues.enqueue(stage, model_name, target_fn, uuid)
            create_sync_event_record(
                stage, model_name, target_fn,
                u'Finalizado enfileramento de todos os registros (%s) do modelo: %s' % (len(list_of_uuids), model_name))
    else:
        create_sync_event_record(
            stage, model_name, target_fn,
            u'Inciando enfileramento de %s registros do modelo: %s' % (len(ids), model_name))
        for oid in ids:
            try:
                obj = model_class.objects.get(pk=oid)
                uuid = obj.uuid
                r_queues.enqueue(stage, model_name, target_fn, uuid)
            except model_class.DoesNotExist as e:
                logger.error(u'Modelo (%s) não existe: %s. pk: %s' % (model_name, str(e), oid))
        create_sync_event_record(
            stage, model_name, target_fn,
            u'Finalizando enfileramento de %s registros do modelo: %s' % (len(ids), model_name))
def task_delete_selected_collections(selected_uuids):
    """
    Task to delete Loaded Collections.

    @param:
    - selected_uuids: list of UUIDs of the documents to be removed

    If `selected_uuids` is longer than SLICE_SIZE, it is sliced into
    sub-lists of SLICE_SIZE and one task is re-enqueued per slice.
    If `selected_uuids` is shorter than SLICE_SIZE, the documents are
    deleted directly from the queryset (LoadCollection and OPAC).
    """
    get_db_connection()
    queues = RQueues()
    SLICE_SIZE = 1000

    if len(selected_uuids) > SLICE_SIZE:
        # fan out: one deletion task per slice of uuids
        for sub_list in chunks(selected_uuids, SLICE_SIZE):
            queues.enqueue('load', 'collection',
                           task_delete_selected_collections,
                           [str(uuid) for uuid in sub_list])
        return

    # remove the LoadCollection documents referenced by the uuids
    LoadCollection.objects.filter(uuid__in=selected_uuids).delete()

    # convert the uuids to _id and delete the matching OPAC records
    register_connections()
    opac_pks = [str(uuid).replace('-', '') for uuid in selected_uuids]
    with switch_db(opac_models.Collection, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.filter(pk__in=opac_pks).delete()
def task_load_all_news():
    """
    Task to run Load of ALL records of the model: News
    """
    get_db_connection()
    queues = RQueues()
    SLICE_SIZE = 1000
    uuid_qs = identifiers_models.NewsIdModel.objects.all().values_list('uuid')
    if len(uuid_qs) > SLICE_SIZE:
        # fan out: one load task per slice of uuids
        for sub_list in chunks(uuid_qs, SLICE_SIZE):
            queues.enqueue('load', 'news', task_load_selected_news,
                           [str(uuid) for uuid in sub_list])
    else:
        # single batch: enqueue everything at once
        queues.enqueue('load', 'news', task_load_selected_news,
                       [str(uuid) for uuid in uuid_qs])
def task_extract_all_articles(uuids=None):
    """
    Task to run Extraction of ALL records of the model: Article

    NOTE(review): the `uuids` parameter is currently ignored — the uuids
    are always read from ArticleIdModel; confirm the intended behavior.
    """
    get_db_connection()
    queues = RQueues()
    SLICE_SIZE = 1000
    uuid_qs = identifiers_models.ArticleIdModel.objects.all().values_list(
        'uuid')
    if len(uuid_qs) > SLICE_SIZE:
        # fan out: one extraction task per slice of uuids
        for sub_list in chunks(uuid_qs, SLICE_SIZE):
            queues.enqueue('extract', 'article',
                           task_extract_selected_articles,
                           [str(uuid) for uuid in sub_list])
    else:
        # single batch: enqueue everything at once
        queues.enqueue('extract', 'article', task_extract_selected_articles,
                       [str(uuid) for uuid in uuid_qs])