def run_thematic_dataset_indexing(dataset: MetadataDataset):
    indexed_resources_ids = []
    metadata_dataset = ""
    if len(dataset.resources) == 0:
        return []
    for metadata in [dataset.title, dataset.notes, dataset.tags]:
        metadata_dataset += f"{metadata} "
    metadata_resources = ""
    if len(solr_dataset.search(f'id:{dataset.id}', fl='id')) != 0:
        log.info("Updating resources and dataset")
        solr_resource.delete(f"package_id:{dataset.id}")
        solr_dataset.delete(f"id:{dataset.id}")
    for resource in dataset.resources:
        log.info('id_resource: ' + resource.id)
        metadata_resource = ""
        for metadata in [resource.name, resource.description]:
            metadata_resource += f"{metadata} "
        solr_resource.add({'id': resource.id,
                           'package_id': resource.package_id,
                           'metadata': metadata_resource + metadata_dataset})
        if not resource.thematic_indexing:
            indexed_resources_ids.append(resource.id)
        metadata_resources += f"{metadata_resource} "
    metadata_dataset += f" {metadata_resources}"
    solr_dataset.add({'id': dataset.id, 'metadata': metadata_dataset})
    return indexed_resources_ids
def run_temporal_dataset_indexing(dataset: MetadataDataset, update_num_package_resources):
    indexed_resources_ids = []
    if update_num_package_resources:
        update_num_resources(dataset.id, dataset.num_resources)
    for resource in dataset.resources:
        log.info('id_resource: ' + resource.id)
        run_temporal_index(resource, dataset.num_resources, dataset.title, dataset.notes)
        indexed_resources_ids.append(resource.id)
    return indexed_resources_ids
def run(task_hour=config.scheduled_hour):
    while True:
        day = datetime.datetime.now()
        download_and_persist_metadata()
        remove()
        finish = index(task_hour, day.date())
        if finish:
            day = datetime.datetime.now().date()
            log.info("Metadata will be updated tomorrow.")
            while datetime.datetime.now().hour != task_hour and day == datetime.datetime.now().date():
                sleep(1)
def remove():
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Removing resources that no longer belong to the database.")
    while True:
        results = get_deleted_resources()
        if not results:
            break
        for result in results:
            delete_spatial_indexes(result[0], result[1])
            delete_temporal_indexes(result[0], result[1])
            delete_thematic_indexes(result[1])
def delete_thematic_indexes(resource):
    log.info(f'removing thematic index of resource {resource.id}')
    solr_resource.delete(id=resource.id)
    if len(solr_resource.search(f'package_id:{resource.package_id}', rows=1)) == 0:
        # No resources left for this package: drop the dataset document as well.
        solr_dataset.delete(id=resource.package_id)
        return
    # Otherwise, strip this resource's metadata from the dataset document and re-add it.
    metadata_resource = ''
    for metadata in [resource.name, resource.description]:
        metadata_resource += f"{metadata} "
    doc = solr_dataset.search(f'id:{resource.package_id}').docs
    doc = doc[0]
    doc['metadata'] = doc['metadata'].replace(metadata_resource, '')
    solr_dataset.delete(id=resource.package_id)
    solr_dataset.add({'id': doc['id'], 'metadata': doc['metadata']})
def types_and_indexes(csv_file: list, driver_: GraphDatabase):
    """
    Some columns contain, for example, neighborhood names that are also municipality names. To reduce the
    identification of "false" places, this method checks a certain number of CSV rows and selects the place
    types (Municipalities, UFs, Regions) that appeared most often, as well as the indexes indicating where
    the found places are.
    """
    quant_rows = 0
    quant_none_place = 0
    quant_rows_with_place_found = 0
    list_patterns = []
    for row in csv_file:
        quant_rows += 1
        pattern_type = ""
        pattern_index = ""
        undefined_type = False
        for j in range(len(row)):
            res = None
            try:
                res = return_type_place(row[j], driver_)
            except Exception:
                log.info("Error checking whether the term is a place and has a type. "
                         "(0-based index) i={0}, j={1}".format(str(quant_rows), str(j)))
            if res:
                if res != "UNDEFINED":
                    pattern_type = "+".join([pattern_type, res])
                    pattern_index = "+".join([pattern_index, str(j)])
                else:
                    undefined_type = True
                    break
        if pattern_type:
            quant_rows_with_place_found += 1
        if pattern_type and not undefined_type:
            list_patterns.append("|".join([pattern_type[1:], pattern_index[1:]]))
        else:
            quant_none_place += 1
    if list_patterns and quant_rows_with_place_found > quant_none_place:
        try:
            m = mode(list_patterns)
        except StatisticsError:
            m = list_patterns[0]
        types_and_index = m.split("|")
        types_in_order = types_and_index[0].split("+")
        index_cols = types_and_index[1].split("+")
        log.info("Place types and indexes found: " + m)
        return types_in_order, [int(i) for i in index_cols]
    else:
        return [], []
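# Illustrative sketch (not part of the original pipeline): how the pattern strings built by
# types_and_indexes() are reduced with statistics.mode. Each sampled row yields a pattern such as
# "MUNICIPIO+UF|0+3" (place types joined by '+', then their column indexes); the most frequent
# pattern wins. The pattern values below are hypothetical.
def _demo_pattern_mode():
    from statistics import StatisticsError, mode

    list_patterns = [
        "MUNICIPIO+UF|0+3",
        "MUNICIPIO+UF|0+3",
        "UF|3",
    ]
    try:
        m = mode(list_patterns)
    except StatisticsError:
        m = list_patterns[0]
    types_part, index_part = m.split("|")
    types_in_order = types_part.split("+")                 # ['MUNICIPIO', 'UF']
    index_cols = [int(i) for i in index_part.split("+")]   # [0, 3]
    return types_in_order, index_cols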
def date_parser(text):
    match = search(text)
    dates = []
    for m in match:
        # e.g. '62.428.073/0001-36' <<< ('0001', 12)
        found = None
        try:
            found = dateparser.search.search_dates(
                m[0],
                languages=['pt'],
                settings={'RELATIVE_BASE': datetime.datetime(1000, 1, 1)})
        except OverflowError:
            log.info("date OverflowError")
        if found and config.min_date_allowed <= found[0][1] <= config.max_date_allowed:
            dates.append((found[0][1], m[1]))
    return dates
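# Illustrative sketch (assumption: the `dateparser` package is the one used above): search_dates()
# returns a list of (matched_text, datetime) tuples, and RELATIVE_BASE anchors the missing parts of
# partial dates (such as a bare month and year) instead of defaulting them to the current date.
def _demo_search_dates():
    import datetime
    from dateparser.search import search_dates

    found = search_dates(
        "dados de março de 2019",
        languages=['pt'],
        settings={'RELATIVE_BASE': datetime.datetime(1000, 1, 1)})
    # e.g. [('março de 2019', datetime.datetime(2019, 3, 1, 0, 0))]
    return found[0][1] if found else None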
def run_temporal_index(resource, num_resources_package, package_title, package_notes):
    funcs = [
        lambda: (date_parser(resource.name), f"resource.name: {resource.name}"),
        lambda: (date_parser(resource.description), f"resource.description: {resource.description}"),
        lambda: (date_parser(package_title), f"package.title: {package_title}"),
        lambda: (date_parser(package_notes), f"package.notes: {package_notes}"),
    ]
    for func in funcs:
        dates = func()
        if dates[0]:
            break
    # -----------------------------------------------------------------------------------------------------------------
    if dates[0]:
        log.info(f"Dates in {dates[1]}")
    if len(dates[0]) == 0:
        interval = None
        if resource.created:
            date = datetime.datetime.strptime(resource.created, '%Y-%m-%dT%H:%M:%S.%f')
            interval = (date, date)
        if interval:
            log.info("Dates in resource.created")
    elif len(dates[0]) == 1:
        date = dates[0][0]
        interval = (date[0], date[0] + relativedelta(months=date[1] - 1, day=31)) \
            if date[1] != 0 else (date[0], date[0])
    else:
        dates = [
            (date[0], date[0] + relativedelta(months=date[1] - 1, day=31)) if date[1] != 0 else (date[0], date[0])
            for date in dates[0]
        ]
        least_recent = dates[0][0]
        last = dates[0][1]
        for date in dates[1:]:
            if date[0] < least_recent:
                least_recent = date[0]
            if date[1] > last:
                last = date[1]
        interval = [least_recent, last]
    if interval:
        interval = [i.date() for i in interval]
        if resource.updated or resource.temporal_indexing:
            log.info(f"resource {resource.id} marked for update")
            delete_temporal_index(resource.id)
        insert_index(resource.id, interval[0], interval[1], resource.package_id, num_resources_package)
        log.info(f"interval found: {interval}")
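# Illustrative sketch (assumption: relativedelta comes from python-dateutil): a parsed date plus a
# span of N months is widened to a (start, end_of_last_month) interval, mirroring
# `date + relativedelta(months=N - 1, day=31)` above; day=31 snaps to the last valid day of that month.
def _demo_month_span_interval():
    import datetime
    from dateutil.relativedelta import relativedelta

    start = datetime.datetime(2019, 1, 1)
    months_span = 3  # hypothetical: a match covering three months
    end = start + relativedelta(months=months_span - 1, day=31)
    return start, end  # (2019-01-01 00:00, 2019-03-31 00:00)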
def count_places_id(index_cols, types_in_order, len_row, csv_file, places_id_dict, start_csv, lock,
                    quant_indexed_rows):
    quant_row = 0
    not_found_place = 0
    quant_indexed_rows_aux = 0
    places_id_dict_aux = {}
    if index_cols:
        neo4j_index = Neo4jIndex()
        for row in csv_file:
            quant_row += 1
            if len(row) == len_row:
                try:
                    res = neo4j_index.find_places_return_id([row[i] for i in index_cols], types_in_order)
                except CypherSyntaxError:
                    res = None
                    log.info(f"CypherSyntaxError. Error on line {quant_row}")
                if not res and not_found_place < 2:
                    not_found_place += 1
                    log.info(f"No 'places' found on line {quant_row + start_csv}")
                elif res:
                    quant_indexed_rows_aux += 1
                    for id_ in res:
                        try:
                            places_id_dict_aux[id_] += 1
                        except KeyError:
                            places_id_dict_aux[id_] = 1
            else:
                log.info(f"CSV line does not match the expected length. Line number: {quant_row + start_csv}")
        # Merge the local counters into the shared structures under the lock.
        lock.acquire()
        quant_indexed_rows.value += quant_indexed_rows_aux
        for key in places_id_dict_aux.keys():
            try:
                places_id_dict[key] += places_id_dict_aux[key]
            except KeyError:
                places_id_dict[key] = places_id_dict_aux[key]
        lock.release()
    del csv_file
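# Illustrative sketch (not part of the pipeline): the local-count-then-merge pattern used by
# count_places_id. Each worker accumulates into a private dict and only touches the shared
# Manager dict under the lock, once per chunk, which keeps lock contention low.
def _demo_count_worker(shared_counts, total_rows, lock, rows):
    local = {}
    for place_id in rows:
        local[place_id] = local.get(place_id, 0) + 1
    with lock:
        total_rows.value += len(rows)
        for key, count in local.items():
            shared_counts[key] = shared_counts.get(key, 0) + count


def _demo_shared_count_merge():
    from multiprocessing import Lock, Manager, Process, Value

    with Manager() as manager:
        shared_counts = manager.dict()
        total_rows = Value('d', 0.0)
        lock = Lock()
        chunks = [['a', 'b', 'a'], ['b', 'b']]  # hypothetical place ids per worker
        procs = [Process(target=_demo_count_worker, args=(shared_counts, total_rows, lock, chunk))
                 for chunk in chunks]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        return dict(shared_counts), total_rows.value  # e.g. ({'a': 2, 'b': 3}, 5.0)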
def index(update_hour, current_date, num=0):
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Indexing resources...")
    while True:
        packages = get_dataset(num)
        now = datetime.datetime.now()
        if now.hour >= update_hour and now.date() > current_date:
            log.info("Stopping indexing to update metadata.")
            return False
        if not packages:
            log.info("All resources have been updated today.")
            return True
        for package in packages:
            resources = package.resources
            if resources:
                log.info(
                    '#------------------------------------------------------------------------------------------#'
                )
                log.info(str(num) + ' - ' + package.id)
                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>spatial indexing<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                package.resources = [
                    resource for resource in resources if not resource.spatial_indexing
                ]
                update_num_package_resources = False
                if len(package.resources) < len(resources):
                    update_num_package_resources = True
                spatial_indexing_done(
                    run_spatial_dataset_indexing(package, update_num_package_resources))
                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>temporal indexing<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                if not package.temporal_indexing:
                    package.resources = resources
                    temporal_indexing_done(
                        run_temporal_dataset_indexing(package, False), package.id)
                else:
                    package.resources = [
                        resource for resource in resources if not resource.temporal_indexing
                    ]
                    update_num_package_resources = False
                    if len(package.resources) < len(resources):
                        update_num_package_resources = True
                    temporal_indexing_done(
                        run_temporal_dataset_indexing(package, update_num_package_resources))
                log.info('')
                log.info(
                    '#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>thematic indexing<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<#'
                )
                if not package.thematic_indexing:
                    package.resources = resources
                    thematic_indexing_done(
                        run_thematic_dataset_indexing(package), package.id)
                for resource in resources:
                    if not resource.thematic_indexing:
                        package.resources = resources
                        thematic_indexing_done(run_thematic_dataset_indexing(package))
                        break
            num += 1
def download_and_persist_metadata():
    log.info(
        '#----------------------------------------------------------------------------------------------#'
    )
    log.info("Downloading and persisting metadata.")
    log.info(f'Checking connection to {url_portal}...')
    while True:
        try:
            assert dados_gov.action.site_read()
            break
        except CKANAPIError as err:
            log.info(err)
            sleep(10)
    log.info('connection established.')
    with engine.connect() as conn:
        conn.execute("UPDATE metadata_resources SET excluded=TRUE;")
    page = 0
    time0 = time()
    limit = config.metadata['limit']
    offset = config.metadata['offset']
    while True:
        log.info(f"Page (up to {limit} resource metadata records): " + str(page))
        while True:
            try:
                metadata = dados_gov.action.current_package_list_with_resources(
                    limit=limit, offset=offset * page)
                break
            except CKANAPIError:
                log.info(f"Could not retrieve page {page}. Trying again...")
                sleep(10)
        page += 1
        new_metadata = pd.json_normalize(metadata)
        dataset = new_metadata[[
            'id', 'maintainer', 'author', 'name', 'url', 'notes', 'metadata_created', 'tags',
            'metadata_modified', 'title'
        ]]
        resources = pd.json_normalize(metadata, 'resources')
        resources = resources[[
            'id', 'package_id', 'url', 'description', 'name', 'format', 'created', 'last_modified'
        ]]
        num_csv = resources['format'].eq('CSV').astype(int).groupby(resources['package_id']).sum()
        dataset = pd.merge(dataset, num_csv, left_on='id', right_on='package_id', how='left')\
            .rename(columns={'format': 'num_resources'})
        resources['spatial_indexing'] = False
        resources['temporal_indexing'] = False
        resources['thematic_indexing'] = False
        resources['updated'] = False
        resources['excluded'] = False
        tags = []
        organizations = []
        organization_id = []
        for m in metadata:
            try:
                organizations.append(m['organization']['name'])
                organization_id.append(m['organization']['id'])
            except TypeError:
                organizations.append(None)
                organization_id.append(None)
            tags.append(", ".join([tag['name'] for tag in m['tags']]))
        dataset['tags'] = tags
        dataset['organization_name'] = organizations
        dataset['organization_id'] = organization_id
        dataset['temporal_indexing'] = False
        dataset['thematic_indexing'] = False
        dataset['portal_id'] = id_portal
        dataset.to_sql(name='metadata_dataset', con=engine, if_exists='append', index=False)
        resources.to_sql(name='metadata_resources', con=engine, if_exists='append', index=False)
        if len(metadata) < limit:
            break
    time1 = time()
    log.info("Time to download and persist metadata: " + str(time1 - time0) + 's')
    with engine.connect() as conn:
        num_dataset = conn.execute("SELECT count(*) FROM metadata_dataset;").fetchone()
        num_resources = conn.execute("SELECT count(*) FROM metadata_resources;").fetchone()
    log.info("Number of datasets: " + str(num_dataset))
    log.info("Number of resources: " + str(num_resources))
    dados_gov.close()
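# Illustrative sketch (hypothetical CKAN-like payload): pd.json_normalize flattens the package list
# into one row per package, and json_normalize(metadata, 'resources') explodes the nested resources
# list into one row per resource, keeping 'package_id' for the join back to the dataset frame.
def _demo_json_normalize():
    import pandas as pd

    metadata = [{
        'id': 'pkg-1',
        'title': 'Example dataset',
        'resources': [
            {'id': 'res-1', 'package_id': 'pkg-1', 'format': 'CSV'},
            {'id': 'res-2', 'package_id': 'pkg-1', 'format': 'PDF'},
        ],
    }]
    datasets = pd.json_normalize(metadata)                 # 1 row: id, title, resources
    resources = pd.json_normalize(metadata, 'resources')   # 2 rows: id, package_id, format
    num_csv = resources['format'].eq('CSV').astype(int).groupby(resources['package_id']).sum()
    return datasets.shape, resources.shape, num_csv.to_dict()  # ((1, 3), (2, 3), {'pkg-1': 1})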
def delete_spatial_indexes(num_resources, resource):
    log.info(f'removing spatial index of resource {resource.id}')
    delete_spatial_index(resource.id, driver)
    update_num_resources(resource.package_id, num_resources, driver)
def run_spatial_dataset_indexing(dataset: MetadataDataset, update_num_package_resources):
    detector_charset = UniversalDetector()
    indexed_resources_ids = []
    if update_num_package_resources:
        update_num_resources(dataset.id, dataset.num_resources, driver)
    for resource in dataset.resources:
        if path.exists("./spatial_indexing/tmp_csv.csv"):
            remove("./spatial_indexing/tmp_csv.csv")
        log.info('id_resource: ' + resource.id + ' url: ' + resource.url)
        try:
            with get(resource.url, timeout=config.request_timeout, stream=True) as request:
                encoding = request.encoding
                log.info('headers: ' + str(request.headers))
                content_type = request.headers['Content-Type']
                if 'text/html' not in content_type and \
                        'text/css' not in content_type and \
                        'text/xml' not in content_type and \
                        'application/vnd.ms-excel' not in content_type and \
                        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' not in content_type:
                    with open('./spatial_indexing/tmp_csv.csv', 'wb') as file_contents:
                        detector_charset.reset()
                        for chunk in request.iter_content(chunk_size=config.csv_chunk_size):
                            file_contents.write(chunk)
                            detector_charset.feed(chunk)
                        detector_charset.close()
                    log.info(f'charset for csv found in the request: {encoding}')
                    log.info(f"charset for csv found by detection [charset: confidence (max. 1.0)]: "
                             f"{detector_charset.result['encoding']}: {detector_charset.result['confidence']}")
                    if not encoding or detector_charset.result['confidence'] >= 0.9:
                        encoding = detector_charset.result['encoding']
                    index(resource, dataset.num_resources, encoding)
                    indexed_resources_ids.append(resource.id)
        except ConnectionError:
            log.info('Connection error, ConnectionError')
        except ReadTimeout:
            log.info('Connection error, ReadTimeout')
        except MissingSchema:
            log.exception('requests.exceptions.MissingSchema', exc_info=True)
        except ChunkedEncodingError:
            log.info('the connection was closed, ChunkedEncodingError')
        except ContentDecodingError:
            log.exception('ContentDecodingError', exc_info=True)
        except StopIteration:
            log.info('Empty chunk')
    return indexed_resources_ids
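# Illustrative sketch (assumption: UniversalDetector comes from the chardet package): feed the
# downloaded chunks to the detector while writing them to disk, then read result['encoding'] and
# result['confidence'] after close(), as run_spatial_dataset_indexing does above.
def _demo_detect_charset(chunks):
    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    detector.reset()
    for chunk in chunks:  # chunks: iterable of bytes, e.g. response.iter_content(...)
        detector.feed(chunk)
        if detector.done:  # the detector may reach a confident answer before all chunks are read
            break
    detector.close()
    return detector.result['encoding'], detector.result['confidence']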
def index(resource: MetadataResources, num_package_resources, encoding, quant_process=config.num_cpu_to_index):
    try:
        with open('./spatial_indexing/tmp_csv.csv', 'r', encoding=encoding, newline=None) as file_contents:
            res = analyze_csv(file_contents)
            if not res:
                return
            dialect = res[0]
            len_row = res[1]
            types_and_indexes_ = res[2]
            with Manager() as manager:
                places_id_dict = manager.dict()
                quant_indexed_rows = Value('d', 0.0)
                lock = Lock()
                time_i = time()
                while True:
                    csv_file = reader(file_contents.readlines(config.csv_chunk_size), dialect, quoting=QUOTE_ALL)
                    csv_file = list(csv_file)
                    if not csv_file:
                        break
                    # -----------------------------------------------------------------------------------------#
                    types_in_order = types_and_indexes_[0]
                    index_cols = types_and_indexes_[1]
                    len_csv_file = len(csv_file)
                    csv_file_division = int(len_csv_file / quant_process)
                    # -----------------------------------------------------------------------------------------#
                    if quant_indexed_rows.value > 0:
                        log.info(f"{quant_indexed_rows.value} rows of this resource were already indexed. "
                                 f"The total number of indexed rows will be updated")
                    processes = []
                    for i in range(quant_process):
                        start = csv_file_division * i
                        if i != quant_process - 1:
                            end = csv_file_division * (1 + i)
                            processes.append(Process(
                                target=count_places_id,
                                args=(index_cols, types_in_order, len_row, csv_file[start:end],
                                      places_id_dict, start, lock, quant_indexed_rows)))
                        else:
                            processes.append(Process(
                                target=count_places_id,
                                args=(index_cols, types_in_order, len_row, csv_file[start:],
                                      places_id_dict, start, lock, quant_indexed_rows)))
                    for process in processes:
                        process.start()
                    for process in processes:
                        process.join()
                    log.info(f"{len_csv_file} rows were checked")
                if resource.updated or resource.spatial_indexing:
                    log.info(f"resource {resource.id} marked for update")
                    delete_spatial_index(resource.id, driver)
                total_places_references = sum(places_id_dict.values())
                for key in places_id_dict:
                    try:
                        insert_into_resource_place(key, resource, num_package_resources,
                                                   total_places_references, places_id_dict[key], driver)
                    except CypherSyntaxError:
                        log.info(f"CypherSyntaxError. key:{key} quant_places: {places_id_dict[key]} "
                                 f"quant_indexed_rows: {quant_indexed_rows.value} "
                                 f"resource: {resource.id, resource.package_id}")
                time_f = time()
                log.info(f"Number of CSV rows indexed: {quant_indexed_rows.value}")
                log.info(f"Indexing time (in seconds): {time_f - time_i}")
                log.info(f"Average indexing time per CSV row (in seconds): "
                         f"{(time_f - time_i) / quant_indexed_rows.value}\n")
    except (UnicodeError, _csv.Error) as err:
        # encoding = 'ISO-8859-1'
        log.info('An exception occurred while decoding the file...')
        log.info(err)
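# Illustrative sketch (hypothetical file): file.readlines(hint) returns whole lines totalling at
# least `hint` bytes, which is how index() above streams a large CSV in chunks instead of loading
# it all at once; an empty return value signals end of file.
def _demo_chunked_readlines(path_csv, hint=1024):
    chunks = 0
    with open(path_csv, 'r', newline=None) as file_contents:
        while True:
            lines = file_contents.readlines(hint)
            if not lines:
                break
            chunks += 1
    return chunks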
def analyze_csv(file):
    try:
        dialect = Sniffer().sniff(''.join(read_lines_file(file, config.num_lines_to_check_csv_dialect)))
        log.info(f"delimiter: ({dialect.delimiter}) doublequote: ({dialect.doublequote}) "
                 f"escapechar: ({dialect.escapechar}) "
                 f"lineterminator: ({dialect.lineterminator}) quotechar: ({dialect.quotechar}) "
                 f"quoting: ({dialect.quoting}) "
                 f"skipinitialspace: ({dialect.skipinitialspace})")
    except _csv.Error:
        log.info("Could not determine the delimiter.")
        return ()
    try:
        file.seek(0)
        csv_file = reader(read_lines_file(file, config.num_lines_to_check_type_of_place), dialect,
                          quoting=QUOTE_ALL)
        csv_file = list(csv_file)
    except _csv.Error as err:
        log.info("<><><><><><><><><><><><><><><>")
        log.info(err)
        return ()
    # Determine the most likely length of each CSV row
    try:
        len_row = mode([len(x) for x in csv_file])
        log.info(f"likely row length: {len_row}")
    except StatisticsError:
        log.info('Problem determining the likely length of each CSV row')
        return ()
    time_i = time()
    types_and_indexes_ = types_and_indexes(csv_file, driver)
    time_f = time()
    log.info(f"Time for type checking: {time_f - time_i}")
    if not types_and_indexes_[0]:
        return ()
    file.seek(0)
    return dialect, len_row, types_and_indexes_
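# Illustrative sketch (hypothetical sample): csv.Sniffer().sniff inspects a text sample and returns
# a Dialect whose delimiter/quotechar the reader can then reuse, as analyze_csv does above. It
# raises _csv.Error when no consistent delimiter can be inferred.
def _demo_sniff_dialect():
    import csv
    import io

    sample = "municipio;uf;valor\nSão Paulo;SP;10\nRecife;PE;7\n"
    dialect = csv.Sniffer().sniff(sample)
    rows = list(csv.reader(io.StringIO(sample), dialect))
    return dialect.delimiter, rows[1]  # (';', ['São Paulo', 'SP', '10'])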
def delete_temporal_indexes(num_resources, resource):
    log.info(f'removing temporal index of resource {resource.id}')
    delete_temporal_index(resource.id)
    update_num_resources(resource.package_id, num_resources)