def pandas_group():
    """Read data from HBase, then aggregate it by project and month with pandas."""
    with krbContext(using_keytab=True, principal=principal, keytab_file=keytab):
        conn = KerberosConnection('hbase02.stor',
                                  protocol='compact',
                                  use_kerberos=True)
        table = conn.table(cm_table)
        columns = ["project", "start_time", "end_time", "cpu", "gpu", "memory"]
        data = table.scan(columns=["infos:%s" % column for column in columns],
                          filter=None)
        # TODO use an HBase filter instead of scanning everything, e.g.
        # "SingleColumnValueFilter('info','project',=,'binary:{0}')"
        df = pd.DataFrame(data=[d[1] for d in data])
        print(df.columns, len(df.index))
        # Column names come back as bytes (b'infos:project'); keep only the qualifier.
        df.columns = [str(column).split(":")[1][:-1] for column in df.columns]
        df["start_time"] = df["start_time"].apply(float)
        df["end_time"] = df["end_time"].apply(float)
        df["cpu"] = df["cpu"].apply(int)
        df["gpu"] = df["gpu"].apply(int)
        df["memory"] = df["memory"].apply(int)

    data = []
    project_groups = df.groupby("project", as_index=True)
    for project, group in project_groups.groups.items():
        project_df = df.loc[group]
        month_groups = project_df.groupby(
            by=lambda i: timestamp_to_datestr(
                project_df.loc[i, "start_time"], project),
            as_index=True)
        df_sum = month_groups.sum()
        print(df_sum.index)
        for month, row in df_sum.iterrows():
            start_time = 0 if pd.isna(row["start_time"]) else row["start_time"]
            end_time = 0 if pd.isna(row["end_time"]) else row["end_time"]
            time_long = (end_time - start_time) / 3600
            time_long = 0 if time_long <= 0 else time_long
            data.append([
                # project is stored as bytes; strip the b'...' wrapper of its repr.
                str(project)[2:-1], month, time_long,
                row["cpu"] * time_long, row["gpu"] * time_long, row["memory"]
            ])
    data.sort(key=lambda x: (x[0], x[1]))

    columns = ["project", "month", "time", "cpu", "gpu", "memory"]
    df = pd.DataFrame(data=data, columns=columns)
    # Output file: monthly core-hour statistics for batch jobs.
    writer = pd.ExcelWriter("批任务核时月度统计.xls")
    df.to_excel(writer, sheet_name="batch_jobs_info", index=False)
    writer.save()
    writer.close()
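# NOTE: `pandas_group` above calls a `timestamp_to_datestr` helper that is not shown
# in this listing. A minimal sketch of what it might look like, assuming start_time
# is a Unix timestamp in seconds and the grouping key is a 'YYYY-MM' month string
# (the `project` argument appears unused for the conversion itself):
def timestamp_to_datestr(timestamp, project=None):
    from datetime import datetime  # local import to keep the sketch self-contained
    return datetime.fromtimestamp(float(timestamp)).strftime("%Y-%m")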
def hbase_connect_demo():
    with krbContext(using_keytab=True,
                    principal='*****@*****.**',
                    keytab_file='/houleilei.client.keytab'):
        connection = KerberosConnection('hbase02.stor',
                                        protocol='compact',
                                        use_kerberos=True)
        test_table = connection.table('houleilei:test')
        # insert
        test_table.put('row_key_1', {'f1:q1': 'v1'})
        # get data
        print(test_table.row('row_key_1'))
def batch_send_(self, tab_name, li):
    connection = KerberosConnection(
        HBASE_HOST,
        protocol='compact',
        use_kerberos=True,
        table_prefix=HBASE_TABLE_PREFIX,
        table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
        sasl_service_name='hbase')
    tab = connection.table(tab_name)
    batch = tab.batch()
    for r_k, i_d in li:
        batch.put(r_k, i_d)
    batch.send()
    connection.close()
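# Usage sketch for `batch_send_` above (not part of the original code): `li` is
# expected to be an iterable of (row_key, data_dict) pairs, where each dict maps
# 'family:qualifier' to a value. `sender` and 'demo_table' are hypothetical; the
# HBASE_* constants are assumed to be module-level settings.
def demo_batch_send(sender):
    rows = [
        ('row_1', {'info:name': 'alice', 'info:age': '30'}),
        ('row_2', {'info:name': 'bob', 'info:age': '25'}),
    ]
    sender.batch_send_('demo_table', rows)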
def get_hbase_table(table_name, server=None, timeout=None):
    hbase_server = server or settings.PROMOTRON_HBASE_SERVER
    hbase_timeout = timeout or settings.PROMOTRON_HBASE_TIMEOUT
    try:
        connection = KerberosConnection(
            hbase_server,
            protocol='compact',
            use_kerberos=True,
            timeout=hbase_timeout,
        )
        return connection.table(table_name)
    except Exception as e:
        logger.error("Error getting table from hbase: " + str(e))
        raise Exception(str(e)) from e
def get_table(self):
    try:
        connection = KerberosConnection(
            self.server,
            timeout=self.timeout,
            use_kerberos=True,
            protocol="compact",
        )
    except Exception:
        # Retry the connection once if the first attempt fails.
        connection = KerberosConnection(
            self.server,
            timeout=self.timeout,
            use_kerberos=True,
            protocol="compact",
        )
    return connection.table(self.table_name)
def read_csv_to_hbase():
    """Read batch-job records from a CSV file and write them to HBase."""
    columns = [
        "id", "name", "category", "project", "cluster", "creator", "datasets",
        "result", "status", "start_time", "end_time", "image", "git_url",
        "git_branch", "git_commit", "command", "cpu", "gpu", "spot", "memory",
        "gpu_model", "relation_report"
    ]
    df = pd.read_csv("/home/holyzing/Desktop/batchjob_info.csv",
                     delimiter=",", header=None)
    df.columns = columns
    print(len(df.index))
    with krbContext(using_keytab=True, principal=principal, keytab_file=keytab):
        conn = KerberosConnection('hbase02.stor',
                                  protocol='compact',
                                  use_kerberos=True)
        table = conn.table(cm_table)
        # batch = table.batch()
        # batch.put()
        for index, row in df.iterrows():
            data = {f"infos:{column}": str(row[column]) for column in columns}
            # Keep only the numeric part of the resource fields.
            data["infos:cpu"] = re.search(r"\d+", data["infos:cpu"]).group()
            data["infos:gpu"] = re.search(r"\d+", data["infos:gpu"]).group()
            data["infos:memory"] = re.search(r"\d+", data["infos:memory"]).group()
            data["infos:spot"] = re.search(r"true|false", data["infos:spot"]).group()
            data["infos:gpu_model"] = data["infos:gpu_model"].split(": ")[1][1:-3]
            table.put(str(index), data)
        conn.close()
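# The commented-out `table.batch()` calls in `read_csv_to_hbase` hint at a batched
# variant. A minimal sketch, assuming `table` behaves like a happybase Table: using
# the batch as a context manager sends the buffered puts in chunks of `batch_size`
# and flushes any remainder on exit. The helper name is hypothetical.
def put_dataframe_batched(table, df, columns, batch_size=500):
    with table.batch(batch_size=batch_size) as batch:
        for index, row in df.iterrows():
            data = {f"infos:{column}": str(row[column]) for column in columns}
            batch.put(str(index), data)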
def get_connection(server):
    connection = KerberosConnection(server,
                                    protocol='compact',
                                    use_kerberos=True,
                                    timeout=300000)
    return connection
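# Usage sketch for `get_connection` above, following the krbContext pattern used by
# the other snippets in this listing. The principal, keytab path, table name and
# `read_single_row` helper are placeholders, not part of the original code.
def read_single_row(server, table_name, row_key):
    with krbContext(using_keytab=True,
                    principal='user@EXAMPLE.COM',
                    keytab_file='/path/to/user.keytab'):
        connection = get_connection(server)
        try:
            return connection.table(table_name).row(row_key)
        finally:
            connection.close()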
def execute_process(options):
    spark = pyspark.sql.session.SparkSession \
        .builder \
        .appName("criar_tabela_pip_investigados") \
        .enableHiveSupport() \
        .getOrCreate()

    # Avoid memory problems caused by broadcast joins
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

    schema_exadata = options['schema_exadata']
    schema_exadata_aux = options['schema_exadata_aux']
    schema_hbase = options['schema_hbase']

    assunto_delito = spark.sql("""
        SELECT H.id,
            CASE WHEN P.nome_delito IS NOT NULL THEN P.nome_delito ELSE H.hierarquia END AS assunto
        FROM {0}.mmps_assunto_docto H
        LEFT JOIN {0}.tb_penas_assuntos P ON P.id = H.id
    """.format(schema_exadata_aux))
    assunto_delito.createOrReplaceTempView('assunto_delito')

    assuntos = spark.sql("""
        SELECT asdo_docu_dk, concat_ws(' --- ', collect_list(assunto)) as assuntos
        FROM {0}.mcpr_assunto_documento
        JOIN assunto_delito ON id = asdo_assu_dk
        WHERE asdo_dt_fim IS NULL
        GROUP BY asdo_docu_dk
    """.format(schema_exadata))
    assuntos.createOrReplaceTempView('assuntos')

    representantes_investigados = spark.sql("""
        SELECT DISTINCT representante_dk
        FROM {1}.tb_pip_investigados_representantes
        JOIN {0}.mcpr_personagem ON pess_dk = pers_pess_dk
        WHERE pers_tppe_dk IN (290, 7, 21, 317, 20, 14, 32, 345, 40, 5, 24)
    """.format(schema_exadata, schema_exadata_aux))
    representantes_investigados.createOrReplaceTempView(
        'representantes_investigados')

    documentos_pips = spark.sql("""
        SELECT R.representante_dk, R.pess_dk,
            concat_ws(', ', collect_list(tppe_descricao)) as tppe_descricao,
            concat_ws(', ', collect_list(cast(tppe_dk as int))) as tppe_dk,
            pers_docu_dk as docu_dk
            --MIN(CASE WHEN pers_dt_fim <= current_timestamp() THEN 'Data Fim Atingida' ELSE 'Ativo' END) AS status_personagem,
            --concat_ws(', ', collect_list(pers_dk)) as pers_dk
        FROM {0}.mcpr_personagem
        JOIN {0}.mcpr_tp_personagem ON tppe_dk = pers_tppe_dk
        JOIN {1}.tb_pip_investigados_representantes R ON pers_pess_dk = R.pess_dk
        JOIN representantes_investigados RI ON RI.representante_dk = R.representante_dk
        GROUP BY R.representante_dk, R.pess_dk, pers_docu_dk
    """.format(schema_exadata, schema_exadata_aux))
    documentos_pips.createOrReplaceTempView('documentos_pips')
    spark.catalog.cacheTable('documentos_pips')

    docs_representantes = spark.sql("""
        SELECT DISTINCT docu_dk, representante_dk FROM documentos_pips
    """)
    docs_representantes.createOrReplaceTempView('docs_representantes')
    spark.catalog.cacheTable('docs_representantes')

    documentos_investigados = spark.sql("""
        WITH tb_coautores AS (
            SELECT A.docu_dk, A.representante_dk,
                concat_ws(', ', collect_list(C.pess_nm_pessoa)) as coautores
            FROM docs_representantes A
            JOIN docs_representantes B ON A.docu_dk = B.docu_dk
                AND A.representante_dk != B.representante_dk
            JOIN {0}.mcpr_pessoa C ON C.pess_dk = B.representante_dk
            GROUP BY A.docu_dk, A.representante_dk
        ),
        ultimos_andamentos AS (
            SELECT docu_dk, pcao_dt_andamento, tppr_descricao,
                row_number() over (partition by docu_dk order by pcao_dt_andamento desc) as nr_and
            FROM (SELECT DISTINCT docu_dk FROM documentos_pips) p
            JOIN {0}.mcpr_vista ON vist_docu_dk = docu_dk
            JOIN {0}.mcpr_andamento ON pcao_vist_dk = vist_dk
            JOIN {0}.mcpr_sub_andamento ON stao_pcao_dk = pcao_dk
            JOIN {0}.mcpr_tp_andamento ON stao_tppr_dk = tppr_dk
        )
        SELECT
            D.representante_dk,
            coautores,
            tppe_descricao,
            tppe_dk,
            docu_orgi_orga_dk_responsavel as pip_codigo,
            D.docu_dk,
            DOCS.docu_nr_mp,
            DOCS.docu_nr_externo,
            DOCS.docu_dt_cadastro,
            --cldc_ds_classe,
            O.orgi_nm_orgao,
            --DOCS.docu_tx_etiqueta,
            assuntos,
            fsdc_ds_fase,
            pcao_dt_andamento as dt_ultimo_andamento,
            tppr_descricao as desc_ultimo_andamento,
            D.pess_dk,
            --status_personagem,
            --pers_dk,
            cod_pct,
            cast(substring(cast(D.representante_dk as string), -1, 1) as int) as rep_last_digit
        FROM documentos_pips D
        JOIN {0}.mcpr_documento DOCS ON DOCS.docu_dk = D.docu_dk
        LEFT JOIN tb_coautores CA ON CA.docu_dk = D.docu_dk
            AND CA.representante_dk = D.representante_dk
        LEFT JOIN (SELECT * FROM ultimos_andamentos WHERE nr_and = 1) UA ON UA.docu_dk = D.docu_dk
        JOIN {0}.orgi_orgao O ON orgi_dk = docu_orgi_orga_dk_responsavel
        --LEFT JOIN {0}.mcpr_classe_docto_mp ON cldc_dk = docu_cldc_dk
        LEFT JOIN assuntos TASSU ON asdo_docu_dk = D.docu_dk
        LEFT JOIN {0}.mcpr_fases_documento ON docu_fsdc_dk = fsdc_dk
        LEFT JOIN {1}.atualizacao_pj_pacote ON id_orgao = docu_orgi_orga_dk_responsavel
        WHERE docu_tpst_dk != 11
    """.format(schema_exadata, schema_exadata_aux))

    table_name_procedimentos = options['table_name_procedimentos']
    table_name = "{}.{}".format(schema_exadata_aux, table_name_procedimentos)

    documentos_investigados.repartition("rep_last_digit").write.mode(
        "overwrite").saveAsTable("temp_table_pip_investigados_procedimentos")
    spark.catalog.clearCache()
    temp_table = spark.table("temp_table_pip_investigados_procedimentos")
    temp_table.repartition(15).write.mode("overwrite").partitionBy(
        'rep_last_digit').saveAsTable(table_name)
    spark.sql("drop table temp_table_pip_investigados_procedimentos")

    execute_compute_stats(table_name)

    # The count will only be used by and for PIPs, at least for now
    table = spark.sql("""
        WITH DOCS_INVESTIGADOS_FILTERED AS (
            SELECT representante_dk, pess_dk, docu_dk, docu_dt_cadastro, pip_codigo, fsdc_ds_fase
            FROM {1}.tb_pip_investigados_procedimentos
            WHERE tppe_dk REGEXP '(^| )(290|7|21|317|20|14|32|345|40|5|24)(,|$)' -- only investigated parties enter the count
            AND cod_pct IN (200, 201, 202, 203, 204, 205, 206, 207, 208, 209)
        ),
        DISTINCT_DOCS_COUNT AS (
            SELECT representante_dk,
                COUNT(DISTINCT docu_dk) as nr_investigacoes,
                MAX(docu_dt_cadastro) as max_docu_date
            FROM DOCS_INVESTIGADOS_FILTERED
            JOIN {0}.mcpr_pessoa P ON P.pess_dk = representante_dk
            WHERE P.pess_nm_pessoa NOT REGEXP 'IDENTIFICADO|IGNORAD[OA]|APURA[CÇ][AÃ]O'
            GROUP BY representante_dk
        )
        SELECT pess_nm_pessoa, t.*, MULTI.flag_multipromotoria, TOPN.flag_top50,
            cast(substring(cast(t.pip_codigo as string), -1, 1) as int) as orgao_last_digit
        FROM (
            SELECT c.representante_dk, pip_codigo, nr_investigacoes
            FROM DISTINCT_DOCS_COUNT c
            JOIN (
                SELECT representante_dk, pip_codigo
                FROM DOCS_INVESTIGADOS_FILTERED
                GROUP BY representante_dk, pip_codigo
                HAVING SUM(CASE WHEN fsdc_ds_fase = "Em Andamento" THEN 1 ELSE 0 END) > 0
            ) r ON c.representante_dk = r.representante_dk) t
        LEFT JOIN (
            SELECT representante_dk, True as flag_multipromotoria
            FROM DOCS_INVESTIGADOS_FILTERED
            GROUP BY representante_dk
            HAVING COUNT(DISTINCT pip_codigo) > 1
        ) MULTI ON MULTI.representante_dk = t.representante_dk
        LEFT JOIN (
            SELECT representante_dk, True as flag_top50
            FROM DISTINCT_DOCS_COUNT
            ORDER BY nr_investigacoes DESC, max_docu_date DESC
            LIMIT 50
        ) TOPN ON TOPN.representante_dk = t.representante_dk
        JOIN {0}.mcpr_pessoa ON pess_dk = t.representante_dk
    """.format(schema_exadata, schema_exadata_aux))

    table_name_investigados = options['table_name_investigados']
    table_name = "{}.{}".format(schema_exadata_aux, table_name_investigados)

    table.repartition('orgao_last_digit').write.mode("overwrite").saveAsTable(
        "temp_table_pip_investigados")
    temp_table = spark.table("temp_table_pip_investigados")
    temp_table.repartition(10).write.mode("overwrite").partitionBy(
        'orgao_last_digit').saveAsTable(table_name)
    spark.sql("drop table temp_table_pip_investigados")

    execute_compute_stats(table_name)

    # Investigated parties that appear in new documents have their flags reset in HBase
    table_name_dt_checked = options['table_name_dt_checked']
    is_exists_dt_checked = check_table_exists(spark, schema_exadata_aux,
                                              table_name_dt_checked)
    current_time = datetime.now()

    if not is_exists_dt_checked:
        new_names = spark.sql("""
            SELECT pip_codigo, collect_list(representante_dk) as representantes
            FROM {0}.tb_pip_investigados_procedimentos
            WHERE docu_dt_cadastro > '{1}'
            GROUP BY pip_codigo
        """.format(
            schema_exadata_aux,
            str(current_time - timedelta(minutes=20)))).collect()
    else:
        new_names = spark.sql("""
            SELECT pip_codigo, collect_list(representante_dk) as representantes
            FROM {0}.tb_pip_investigados_procedimentos
            JOIN {0}.dt_checked_investigados
            WHERE docu_dt_cadastro > dt_ultima_verificacao
            GROUP BY pip_codigo
        """.format(schema_exadata_aux)).collect()

    conn = KerberosConnection(
        'bda1node05.pgj.rj.gov.br',
        timeout=3000,
        use_kerberos=True,
        protocol="compact",
    )
    #conn = Connection('bda1node05.pgj.rj.gov.br')
    t = conn.table('{}:pip_investigados_flags'.format(schema_hbase))
    for orgao in new_names:
        orgao_id = str(orgao['pip_codigo'])
        representantes = orgao['representantes']
        removed_rows = t.scan(
            row_prefix=orgao_id,
            filter=("DependentColumnFilter ('flags', 'is_removed')"))
        for row in removed_rows:
            if row[1].get("identificacao:representante_dk") and row[1][
                    "identificacao:representante_dk"].decode(
                        'utf-8') in representantes:
                t.delete(row[0])

    # Use a table to store the date of the last check for new documents
    tb_ultima_verificacao = spark.sql("""
        SELECT '{0}' as dt_ultima_verificacao
    """.format(str(current_time)))

    table_name = "{}.{}".format(schema_exadata_aux, table_name_dt_checked)
    tb_ultima_verificacao.write.mode("overwrite").saveAsTable(table_name)
    execute_compute_stats(table_name)
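# `execute_process` above relies on a `check_table_exists` helper that is not shown
# in this listing. A minimal sketch, assuming it only needs to test whether a Hive
# table is present in a given schema (SHOW TABLES ... LIKE is standard Spark SQL):
def check_table_exists(spark, schema, table_name):
    return spark.sql(
        "SHOW TABLES IN {} LIKE '{}'".format(schema, table_name)
    ).count() > 0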
def batch_send(self, li, tab_name, row_key, item_data, batch_size):
    li.append((row_key, item_data))
    # print(len(li))
    if len(li) >= batch_size:
        try:
            connection = KerberosConnection(
                HBASE_HOST,
                protocol='compact',
                use_kerberos=True,
                table_prefix=HBASE_TABLE_PREFIX,
                table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
                sasl_service_name='hbase')
            tab = connection.table(tab_name)
            batch = tab.batch()
            for r_k, i_d in li:
                batch.put(r_k, i_d)
            batch.send()
            connection.close()
        except Exception as e:
            # On failure (e.g. the table does not exist yet), create it with a
            # single 'info' column family and retry the whole batch.
            connection = KerberosConnection(
                HBASE_HOST,
                protocol='compact',
                use_kerberos=True,
                table_prefix=HBASE_TABLE_PREFIX,
                table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
                sasl_service_name='hbase')
            schema = {'info': dict(max_versions=3)}
            connection.create_table(tab_name, schema)
            tab = connection.table(tab_name)
            batch = tab.batch()
            for r_k, i_d in li:
                batch.put(r_k, i_d)
            batch.send()
            connection.close()
        finally:
            # Clear the buffer whether or not the send succeeded.
            del li[:]
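# Usage sketch for `batch_send` above (the caller and table name are hypothetical):
# rows accumulate in `buffer` and are only flushed to HBase once `batch_size`
# pairs have been collected.
def demo_buffered_send(sender, records):
    buffer = []
    for row_key, data in records:
        sender.batch_send(buffer, 'demo_table', row_key, data, batch_size=100)
    # Any remainder smaller than batch_size stays in `buffer` and is not sent here.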