def task_delete_selected_collections_identifiers(selected_ids):
    """
    Delete Collection identifier documents.

    @param:
    - selected_ids: list of pks of the documents to remove

    If `selected_ids` is longer than SLICE_SIZE the list is split into
    SLICE_SIZE batches and each batch is re-enqueued as a new task;
    otherwise the documents are deleted directly from the queryset.
    """
    get_db_connection()
    queues = RQueues()
    batch_size = 1000
    if len(selected_ids) > batch_size:
        # too many pks for one delete: fan out into smaller tasks
        for batch in chunks(selected_ids, batch_size):
            queues.enqueue(
                'sync_ids', 'collection',
                task_delete_selected_collections_identifiers,
                [str(pk) for pk in batch])
    else:
        CollectionIdModel.objects.filter(pk__in=selected_ids).delete()
def task_journal_update(ids=None):
    """
    Enqueue transform tasks for journals.

    :param ids: optional list of TransformJournal pks; when None, every
                journal is flagged ``must_reprocess`` and re-enqueued.
    :raises ValueError: when a journal has no ISSN at all.
    """
    get_db_connection()
    stage = 'transform'
    model = 'journal'
    r_queues = RQueues()
    r_queues.create_queues_for_stage(stage)
    # NOTE(review): assumes at least one TransformCollection exists;
    # `.first()` returns None otherwise and `.acronym` below would fail — confirm.
    collection = models.TransformCollection.objects.all().first()
    if ids is None:  # update all journals
        models.TransformJournal.objects.all().update(must_reprocess=True)
        for journal in models.TransformJournal.objects.all():
            # first non-empty ISSN wins; 'eletronic_issn' is presumably the
            # stored field's spelling — do not "correct" it here without
            # checking the model definition.
            issn = journal.get('scielo_issn', False) or \
                journal.get('print_issn', False) or \
                journal.get('eletronic_issn', False)
            if not issn:
                raise ValueError(u'Journal sem issn')
            r_queues.enqueue(
                stage, model,
                task_transform_journal,
                collection.acronym,
                issn)
    else:
        for oid in ids:
            try:
                obj = models.TransformJournal.objects.get(pk=oid)
                obj.update(must_reprocess=True)
                obj.reload()
                issn = obj.get('scielo_issn', False) or \
                    obj.get('print_issn', False) or \
                    obj.get('eletronic_issn', False)
                if not issn:
                    raise ValueError(u'Journal sem issn')
                r_queues.enqueue(
                    stage, model,
                    task_transform_journal,
                    collection.acronym,
                    issn)
            except Exception as e:
                # best-effort per pk: log and continue with the next one
                logger.error('models.TransformJournal %s. pk: %s', str(e), oid)
def serial_retriever_article_ids(filepath):
    """
    Read a JSON-lines file and create or refresh ArticleIdModel records.

    Each line holds at least 'code' (the article pid) and
    'processing_date'; unknown pids create new records, known pids are
    updated only when the processing date changed.
    """
    get_db_connection()
    with open(filepath) as json_lines:
        logger.info('lendo arquivo: %s', filepath)
        for raw_line in json_lines:
            record = json.loads(raw_line)
            try:
                pid = record['code']
                incoming_date = parse_date_str_to_datetime_obj(record['processing_date'])
                article = ArticleIdModel.objects.get(article_pid=pid)
            except ArticleIdModel.DoesNotExist:
                # unknown pid: register a brand-new article id record
                fields = {
                    'journal_issn': parse_journal_issn_from_article_code(pid),
                    'issue_pid': parse_issue_pid_from_article_code(pid),
                    'article_pid': pid,
                    'processing_date': record['processing_date'],
                }
                new_article = ArticleIdModel(**fields)
                logger.info('cadastrando novo artigo: %s', record)
                new_article.save()
            else:
                if article.processing_date != incoming_date:  # update
                    logger.info('atualizando aid: %s', pid)
                    article.processing_date = incoming_date
                    article.save()
                else:
                    logger.info(u'artigo aid: %s sem mudança de data', pid)
def task_delete_selected_collections(selected_uuids):
    """
    Delete Loaded Collections.

    @param:
    - selected_uuids: list of UUIDs of the documents to remove

    When `selected_uuids` exceeds SLICE_SIZE the list is re-enqueued in
    SLICE_SIZE batches; otherwise the documents are deleted directly,
    both from LoadCollection and from the OPAC web-app database.
    """
    get_db_connection()
    queues = RQueues()
    batch_size = 1000
    if len(selected_uuids) > batch_size:
        # too many uuids for one delete: fan out into smaller tasks
        for batch in chunks(selected_uuids, batch_size):
            queues.enqueue('load', 'collection',
                           task_delete_selected_collections,
                           [str(u) for u in batch])
    else:
        # drop the LoadCollection documents matching the uuids
        LoadCollection.objects.filter(uuid__in=selected_uuids).delete()
        # the OPAC pk is the uuid with the dashes stripped; delete there too
        register_connections()
        opac_pks = [str(u).replace('-', '') for u in selected_uuids]
        with switch_db(opac_models.Collection, OPAC_WEBAPP_DB_NAME) as opac_model:
            opac_model.objects.filter(pk__in=opac_pks).delete()
def task_delete_selected_collections(selected_uuids):
    """
    Delete Transformed Collections.

    @param:
    - selected_uuids: list of UUIDs of the documents to remove

    When `selected_uuids` exceeds SLICE_SIZE it is re-enqueued in
    SLICE_SIZE batches; otherwise the queryset is deleted directly.
    """
    get_db_connection()
    queues = RQueues()
    batch_size = 1000
    if len(selected_uuids) > batch_size:
        # fan out into smaller delete tasks
        for batch in chunks(selected_uuids, batch_size):
            queues.enqueue('transform', 'collection',
                           task_delete_selected_collections,
                           [str(u) for u in batch])
    else:
        TransformCollection.objects.filter(uuid__in=selected_uuids).delete()
def task_delete_selected_diff_etl_model(stage, model_name, action, selected_uuids):
    """
    Delete selected diff records of one ETL model.

    :param stage: ETL phase (validated against ETL_STAGE_LIST)
    :param model_name: ETL model name (validated against ETL_MODEL_NAME_LIST)
    :param action: diff action (validated against ACTION_LIST)
    :param selected_uuids: UUIDs to delete; lists longer than SLICE_SIZE
        are split and re-enqueued.
    :raises ValueError: for any invalid stage/model_name/action.
    """
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)
    if action not in ACTION_LIST:
        raise ValueError(u'param action: %s é inválido' % action)
    get_db_connection()
    diff_model = DIFF_MODEL_CLASS_BY_NAME[model_name]
    queues = RQueues()
    batch_size = 1000
    if len(selected_uuids) > batch_size:
        for batch in chunks(selected_uuids, batch_size):
            queues.enqueue(
                'sync_ids', model_name,
                task_delete_selected_diff_etl_model,
                # task args: same call, smaller uuid batch
                stage, model_name, action, [str(u) for u in batch])
    else:
        diff_model.objects.filter(uuid__in=selected_uuids).delete()
def task_article_update(ids=None):
    """
    Enqueue transform tasks for articles.

    :param ids: optional list of TransformArticle pks; when None, every
                article is flagged ``must_reprocess`` and re-enqueued.
    """
    get_db_connection()
    stage = 'transform'
    model = 'article'
    r_queues = RQueues()
    r_queues.create_queues_for_stage(stage)
    # NOTE(review): assumes at least one TransformCollection exists;
    # `.first()` returns None otherwise and `.acronym` below would fail — confirm.
    collection = models.TransformCollection.objects.all().first()
    if ids is None:  # update all articles
        models.TransformArticle.objects.all().update(must_reprocess=True)
        for article in models.TransformArticle.objects.all():
            r_queues.enqueue(
                stage, model,
                task_transform_article,
                collection.acronym,
                article.pid)
    else:
        for oid in ids:
            try:
                obj = models.TransformArticle.objects.get(pk=oid)
                obj.update(must_reprocess=True)
                obj.reload()
                r_queues.enqueue(
                    stage, model,
                    task_transform_article,
                    collection.acronym,
                    obj.pid)
            except Exception as e:
                # best-effort per pk: log and continue with the next one
                logger.error('models.TransformArticle %s. pk: %s', str(e), oid)
def task_consume_diff_update(stage, model_name):
    """
    Consume the UPDATE diff records filtered by:

    - @param stage: ETL phase
    - @param model_name: ETL model name

    Unapplied UUIDs are fetched from the model's differ and enqueued in
    batches of SLICE_SIZE for task_differ_apply_for_selected_uuids.
    """
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)
    action = 'update'
    batch_size = 1000
    queues = RQueues()
    get_db_connection()
    differ = ETL_DIFFERS_BY_MODEL[model_name]()
    pending_uuids = differ.get_uuids_unapplied(stage, action)
    for batch in chunks(pending_uuids, batch_size):
        logger.info(u'enfilerando: consumo de UUUIDs selecionados (stage:%s, model: %s, action: %s)' % (stage, model_name, action))
        queues.enqueue(
            'sync_ids', model_name,
            task_differ_apply_for_selected_uuids,
            stage, model_name, action,
            [str(u) for u in batch])
def enqueue_full_populate_task_by_model(model_name='all'):
    """
    Enqueue the "populate" task(s) that fill in the dates of the diff
    records for one ETL model, or for every model when ``model_name == 'all'``.

    :param model_name: 'collection', 'journal', 'issue', 'article',
                       'news', 'press_release' or 'all'.
    :raises ValueError: when ``model_name`` is none of the above.
    """
    options = {
        'collection': {'model_class': models.ExtractCollection,
                       'task_fn': task_populate_collections},
        'journal': {'model_class': models.ExtractJournal,
                    'task_fn': task_populate_journals},
        'issue': {'model_class': models.ExtractIssue,
                  'task_fn': task_populate_issues},
        'article': {'model_class': models.ExtractArticle,
                    'task_fn': task_populate_articles},
        'news': {'model_class': models.ExtractNews,
                 'task_fn': task_populate_news},
        'press_release': {'model_class': models.ExtractPressRelease,
                          'task_fn': task_populate_press_release},
    }
    # FIX: validate up front. The original raised only AFTER writing the
    # "start" sync-event record, leaving a dangling start event for
    # invalid input.
    if model_name != 'all' and model_name not in options:
        raise ValueError('Param: model_name: %s inesperado' % model_name)

    logger.info("Inicinado: enqueue_full_populate_task_by_model para modelo: %s", model_name)
    create_sync_event_record(
        'sync_ids', model_name, 'enqueue_full_populate_task_by_model',
        u'Inciando enfileramento para preencher datas dos registros diff model: %s' % model_name)

    get_db_connection()
    stage = 'sync_ids'
    r_queues = RQueues()
    # single enqueue loop for both the 'all' and the single-model case
    if model_name == 'all':
        targets = options.items()
    else:
        targets = [(model_name, options[model_name])]
    for name, opts in targets:
        logger.info("enfilerando stage: %s model_name: %s model_class: %s" % (
            stage, name, opts['model_class']))
        r_queues.enqueue(stage, name, opts['task_fn'])

    create_sync_event_record(
        'sync_ids', model_name, 'enqueue_full_populate_task_by_model',
        u'Fim do enfileramento para preencher datas dos registros diff model: %s' % model_name)
    logger.info("Fim: enqueue_full_populate_task_by_model para modelo: %s", model_name)
def task_populate_news(ids=None):
    """Enqueue populate tasks for News identifier records (all when ids is None)."""
    get_db_connection()
    generic_task_enqueue_from_uuid_iterable(
        'sync_ids', 'news', models.ExtractNews,
        RQueues(), task_populate_one_news, ids)
def task_collection_create():
    """Enqueue a load task for every transformed collection."""
    get_db_connection()
    queues = RQueues()
    for collection in models.TransformCollection.objects.all():
        queues.enqueue('load', 'collection', task_load_collection,
                       collection.uuid)
def task_populate_press_release(ids=None):
    """Enqueue populate tasks for PressRelease identifier records (all when ids is None)."""
    get_db_connection()
    generic_task_enqueue_from_uuid_iterable(
        'sync_ids', 'press_release', models.ExtractPressRelease,
        RQueues(), task_populate_one_press_release, ids)
def task_collection_create():
    """Enqueue the (single) collection transform task."""
    get_db_connection()
    stage = 'transform'
    queues = RQueues()
    queues.create_queues_for_stage(stage)
    queues.enqueue(stage, 'collection', task_transform_collection)
def delete_identifiers(model_name):
    """
    Remove every Identifiers* document of the model ``model_name``.

    :param model_name: key into ID_MODEL_CLASS
    :raises ValueError: when model_name is not a known identifier model
    """
    # `in dict` instead of `in dict.keys()` — same test, idiomatic
    if model_name not in ID_MODEL_CLASS:
        raise ValueError(u'parametro: model_name: %s não é válido!' % model_name)
    get_db_connection()
    objects = ID_MODEL_CLASS[model_name].objects()
    # lazy %-style logger args instead of eager string interpolation
    logger.info(u"Removendo: %s objetos do modelo: %s",
                objects.count(), model_name)
    objects.delete()
    logger.info(u"Objetos removidos com sucesso!")
def task_issue_create():
    """Enqueue a load task for every transformed issue."""
    get_db_connection()
    queues = RQueues()
    for issue in models.TransformIssue.objects.all():
        queues.enqueue('load', 'issue', task_load_issue, uuid=issue.uuid)
def task_press_release_create():
    """Enqueue a load task for every transformed press release."""
    get_db_connection()
    queues = RQueues()
    for record in models.TransformPressRelease.objects.all():
        queues.enqueue('load', 'press_release', task_load_press_release,
                       uuid=record.uuid)
def task_transform_selected_journals(selected_uuids):
    """
    Enqueue a transform task for each Journal selected by UUID.
    """
    get_db_connection()
    queues = RQueues()
    issns = identifiers_models.JournalIdModel.objects.filter(
        uuid__in=selected_uuids).values_list('journal_issn')
    for issn in issns:
        queues.enqueue('transform', 'journal', task_transform_one_journal, issn)
def task_journal_create():
    """Enqueue a load task for every transformed journal."""
    get_db_connection()
    queues = RQueues()
    for journal in models.TransformJournal.objects.all():
        queues.enqueue('load', 'journal', task_load_journal,
                       uuid=journal.uuid)
def task_article_create():
    """Enqueue a load task for every transformed article."""
    get_db_connection()
    queues = RQueues()
    for article in models.TransformArticle.objects.all():
        queues.enqueue('load', 'article', task_load_article,
                       uuid=article.uuid)
def task_news_create():
    """Enqueue a load task for every transformed news record."""
    get_db_connection()
    queues = RQueues()
    for news in models.TransformNews.objects.all():
        queues.enqueue('load', 'news', task_load_news, uuid=news.uuid)
def task_extract_selected_issues(selected_uuids):
    """
    Enqueue an extraction task for each Issue selected by UUID.
    """
    get_db_connection()
    queues = RQueues()
    pids = identifiers_models.IssueIdModel.objects.filter(
        uuid__in=selected_uuids).values_list('issue_pid')
    for pid in pids:
        queues.enqueue('extract', 'issue', task_extract_one_issue, pid)
def task_extract_selected_issues(selected_uuids):
    """
    Enqueue one extraction task per selected Issue UUID.
    """
    get_db_connection()
    r_queues = RQueues()
    id_model = identifiers_models.IssueIdModel
    selected = id_model.objects.filter(uuid__in=selected_uuids)
    for issue_pid in selected.values_list('issue_pid'):
        r_queues.enqueue('extract', 'issue', task_extract_one_issue,
                         issue_pid)
def task_news_create():
    """Enqueue a transform task for every extracted News record."""
    get_db_connection()
    stage = 'transform'
    queues = RQueues()
    queues.create_queues_for_stage(stage)
    for extracted in models.ExtractNews.objects.all():
        queues.enqueue(stage, 'news', task_transform_news,
                       news_uuid=extracted.uuid)
def task_press_release_create():
    """Enqueue a transform task for every extracted Press Release record."""
    get_db_connection()
    stage = 'transform'
    queues = RQueues()
    queues.create_queues_for_stage(stage)
    for extracted in models.ExtractPressRelease.objects.all():
        queues.enqueue(stage, 'press_release', task_transform_press_release,
                       press_release_uuid=extracted.uuid)
def task_delete_all_articles():
    """Delete every LoadArticle (opac-proc) and every Article (OPAC web-app)."""
    # opac-proc side: drop all Load Article documents
    get_db_connection()
    LoadArticle.objects.all().delete()
    # OPAC side: drop all Article documents in the web-app database
    register_connections()
    with switch_db(opac_models.Article, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.all().delete()
def task_delete_all_journals():
    """Delete every LoadJournal (opac-proc) and every Journal (OPAC web-app)."""
    # opac-proc side: drop all Load Journal documents
    get_db_connection()
    LoadJournal.objects.all().delete()
    # OPAC side: drop all Journal documents in the web-app database
    register_connections()
    with switch_db(opac_models.Journal, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.all().delete()
def task_delete_all_news():
    """Delete every LoadNews (opac-proc) and every News (OPAC web-app)."""
    get_db_connection()
    # opac-proc side: drop all Load News documents
    LoadNews.objects.all().delete()
    # OPAC side: drop all News documents in the web-app database
    register_connections()
    with switch_db(opac_models.News, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.all().delete()
def task_transform_selected_articles(selected_uuids):
    """
    Enqueue a transform task for each Article selected by UUID.
    """
    get_db_connection()
    queues = RQueues()
    pids = identifiers_models.ArticleIdModel.objects.filter(
        uuid__in=selected_uuids).values_list('article_pid')
    for pid in pids:
        queues.enqueue('transform', 'article', task_transform_one_article, pid)
def task_delete_all_press_releases():
    """Delete every LoadPressRelease (opac-proc) and every PressRelease (OPAC web-app)."""
    # opac-proc side: drop all Load PressRelease documents
    get_db_connection()
    LoadPressRelease.objects.all().delete()
    # OPAC side: drop all PressRelease documents in the web-app database
    register_connections()
    with switch_db(opac_models.PressRelease, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.all().delete()
def task_delete_all_diff_etl_model(stage, model_name, action):
    """
    Delete every diff record matching (stage, model_name, action).

    :raises ValueError: for an unknown stage, model_name or action.
    """
    # FIX: validate the parameters BEFORE opening the DB connection
    # (the original connected first; task_delete_selected_diff_etl_model
    # validates first — keep both consistent).
    if stage not in ETL_STAGE_LIST:
        raise ValueError(u'param stage: %s é inválido' % stage)
    if model_name not in ETL_MODEL_NAME_LIST:
        raise ValueError(u'param model_name: %s é inválido' % model_name)
    if action not in ACTION_LIST:
        raise ValueError(u'param action: %s é inválido' % action)
    get_db_connection()
    model_class = DIFF_MODEL_CLASS_BY_NAME[model_name]
    model_class.objects.filter(stage=stage, action=action).delete()
def task_extract_selected_news(selected_uuids):
    """
    Enqueue an extraction task for each News record selected by UUID.
    """
    get_db_connection()
    queues = RQueues()
    for news in ExtractNews.objects.filter(uuid__in=selected_uuids):
        queues.enqueue('extract', 'news', task_extract_one_news,
                       news.feed_url_used, news.feed_lang)
def task_extract_selected_news(selected_uuids):
    """
    Re-extract every News record whose UUID is in `selected_uuids`.
    """
    get_db_connection()
    r_queues = RQueues()
    selected = ExtractNews.objects.filter(uuid__in=selected_uuids)
    for record in selected:
        r_queues.enqueue(
            'extract', 'news', task_extract_one_news,
            record.feed_url_used, record.feed_lang)
def task_extract_selected_press_releases(selected_uuids):
    """
    Enqueue an extraction task for each Press Release selected by UUID.
    """
    get_db_connection()
    queues = RQueues()
    for record in ExtractPressRelease.objects.filter(uuid__in=selected_uuids):
        queues.enqueue('extract', 'press_release',
                       task_extract_one_press_release,
                       record.journal_acronym,
                       record.feed_url_used,
                       record.feed_lang)
def task_journal_create():
    """Enqueue a transform task for every journal (ISSN) of the collection."""
    get_db_connection()
    stage = 'transform'
    model = 'journal'
    r_queues = RQueues()
    r_queues.create_queues_for_stage(stage)
    # NOTE(review): assumes at least one TransformCollection exists;
    # `.first()` returns None otherwise and `.children_ids` would fail — confirm.
    collection = models.TransformCollection.objects.all().first()
    # each child is presumably a dict carrying at least an 'issn' key — verify
    for child in collection.children_ids:
        r_queues.enqueue(
            stage, model,
            task_transform_journal,
            collection.acronym,
            child['issn'])
def task_delete_selected_collections(selected_uuids):
    """
    Delete Loaded Collections (opac-proc and OPAC web-app sides).

    @param:
    - selected_uuids: list of UUIDs of the documents to remove

    Lists longer than SLICE_SIZE are split and re-enqueued; shorter
    lists are deleted directly.
    """
    stage, model = 'load', 'collection'
    get_db_connection()
    r_queues = RQueues()
    slice_size = 1000
    if len(selected_uuids) > slice_size:
        # fan out: one delete task per SLICE_SIZE batch
        for sub_list in chunks(selected_uuids, slice_size):
            r_queues.enqueue(stage, model, task_delete_selected_collections,
                             [str(item) for item in sub_list])
        return
    # delete the LoadCollection documents identified by the uuids
    LoadCollection.objects.filter(uuid__in=selected_uuids).delete()
    # the OPAC pk is the uuid with the dashes removed
    register_connections()
    opac_pks = [str(item).replace('-', '') for item in selected_uuids]
    with switch_db(opac_models.Collection, OPAC_WEBAPP_DB_NAME) as opac_model:
        opac_model.objects.filter(pk__in=opac_pks).delete()
def task_extract_all_articles(uuids=None):
    """
    Enqueue extraction of ALL Article records, in batches of at most
    SLICE_SIZE UUIDs. (The ``uuids`` parameter is accepted but unused,
    as in the original signature.)
    """
    get_db_connection()
    stage = 'extract'
    model = 'article'
    queues = RQueues()
    batch_size = 1000
    all_uuids = identifiers_models.ArticleIdModel.objects.all().values_list(
        'uuid')
    if len(all_uuids) <= batch_size:
        # small enough: a single selected-articles task
        queues.enqueue(stage, model, task_extract_selected_articles,
                       [str(u) for u in all_uuids])
    else:
        for batch in chunks(all_uuids, batch_size):
            queues.enqueue(stage, model, task_extract_selected_articles,
                           [str(u) for u in batch])
def task_load_all_news():
    """
    Enqueue loading of ALL News records, in batches of at most
    SLICE_SIZE UUIDs.
    """
    get_db_connection()
    stage = 'load'
    model = 'news'
    queues = RQueues()
    batch_size = 1000
    all_uuids = identifiers_models.NewsIdModel.objects.all().values_list(
        'uuid')
    if len(all_uuids) <= batch_size:
        # small enough: a single selected-news task
        queues.enqueue(stage, model, task_load_selected_news,
                       [str(u) for u in all_uuids])
    else:
        for batch in chunks(all_uuids, batch_size):
            queues.enqueue(stage, model, task_load_selected_news,
                           [str(u) for u in batch])
def __init__(self):
    """
    Validate that the subclass defined the required class attributes and
    open the database connection once (kept in ``self._db``).
    """
    # same checks, same order, identical error messages as the originals
    required = (
        ('model_name', self.model_name),
        ('id_model_class', self.id_model_class),
        ('ex_model_class', self.ex_model_class),
        ('tr_model_class', self.tr_model_class),
    )
    for attr_name, value in required:
        if value is None:
            raise AttributeError(u'Falta definir atributo: %s' % attr_name)
    if self._db is None:
        self._db = get_db_connection()
def __init__(self):
    """
    Validate required subclass attributes and lazily set up the DB
    connection, the ArticleMeta thrift client and the AM DB API.
    """
    if self.model_name is None:
        raise AttributeError(u'Falta definir atributo: model_name')
    if self.idmodel_class is None:
        raise AttributeError(u'Falta definir atributo: idmodel_class')
    if self._db is None:
        self._db = get_db_connection()
    if self.api_client is None:
        # thrift client for the ArticleMeta service (host/port/timeout
        # come from the project config)
        self.api_client = custom_amapi_client.ArticleMeta(
            config.ARTICLE_META_THRIFT_DOMAIN,
            config.ARTICLE_META_THRIFT_PORT,
            config.ARTICLE_META_THRIFT_TIMEOUT)
    if self.am_db_api is None:
        self.am_db_api = AMDBAPI()
    super(BaseIdDataRetriever, self).__init__()
class Process(object):
    """
    Base class for stage processors: thin wrappers that enqueue the
    concrete tasks defined by subclasses.

    Subclasses are expected to provide ``model_name``,
    ``task_for_selected``, ``task_for_all``, ``task_delete_selected``
    and ``task_delete_all`` (referenced below but not defined here).
    """
    stage = 'default'
    collection_acronym = None
    # NOTE(review): evaluated at class-definition (import) time — a queue
    # helper and a DB connection are created as soon as this module loads;
    # confirm this side effect is intended.
    r_queues = RQueues()
    db = get_db_connection()

    def selected(self, selected_uuids):
        # enqueue processing of an explicit list of UUIDs
        self.r_queues.enqueue(self.stage, self.model_name,
                              self.task_for_selected, selected_uuids)

    def all(self):
        # enqueue processing of every record of the model
        self.r_queues.enqueue(self.stage, self.model_name,
                              self.task_for_all)

    def delete_selected(self, selected_uuids):
        # enqueue deletion of an explicit list of UUIDs
        self.r_queues.enqueue(self.stage, self.model_name,
                              self.task_delete_selected, selected_uuids)

    def delete_all(self):
        # enqueue deletion of every record of the model
        self.r_queues.enqueue(self.stage, self.model_name,
                              self.task_delete_all)