Example #1
def pandas_group():
    """
    从hbase 中读取数据后,使用pandas进行聚合统计
    """
    with krbContext(using_keytab=True, principal=principal,
                    keytab_file=keytab):
        conn = KerberosConnection('hbase02.stor',
                                  protocol='compact',
                                  use_kerberos=True)
        table = conn.table(cm_table)
        columns = ["project", "start_time", "end_time", "cpu", "gpu", "memory"]
        data = table.scan(columns=["infos:%s" % column for column in columns],
                          filter=None)
        # TODO: push the filtering down to HBase instead of scanning everything,
        # e.g. "SingleColumnValueFilter('info','project',=,'binary:{0}')"
        df = pd.DataFrame(data=[d[1] for d in data])
        print(df.columns, len(df.index))
        # column names come back as bytes like b'infos:project';
        # keep only the qualifier part of each name
        df.columns = [str(column).split(":")[1][:-1] for column in df.columns]
        df["start_time"] = df["start_time"].apply(float)
        df["end_time"] = df["end_time"].apply(float)
        df["cpu"] = df["cpu"].apply(int)
        df["gpu"] = df["gpu"].apply(int)
        df["memory"] = df["memory"].apply(int)

        data = []
        project_groups = df.groupby("project", as_index=True)
        for project, group in project_groups.groups.items():
            project_df = df.loc[group]
            month_groups = project_df.groupby(
                by=lambda i: timestamp_to_datestr(
                    project_df.loc[i, "start_time"], project),
                as_index=True)
            df_sum = month_groups.sum()
            print(df_sum.index)
            for month, row in df_sum.iterrows():
                start_time = 0 if pd.isna(
                    row["start_time"]) else row["start_time"]
                end_time = 0 if pd.isna(row["end_time"]) else row["end_time"]
                time_long = (end_time - start_time) / 3600
                time_long = 0 if time_long <= 0 else time_long
                # str(project)[2:-1] strips the b'...' from the bytes repr of the project name
                data.append([
                    str(project)[2:-1], month, time_long,
                    row["cpu"] * time_long, row["gpu"] * time_long,
                    row["memory"]
                ])
        data.sort(key=lambda x: (x[0], x[1]))
        columns = ["project", "month", "time", "cpu", "gpu", "memory"]
        df = pd.DataFrame(data=data, columns=columns)
        # file name means "monthly core-hour statistics for batch jobs"
        writer = pd.ExcelWriter("批任务核时月度统计.xls")
        df.to_excel(writer, sheet_name="batch_jobs_info", index=False)
        writer.close()  # close() also saves the file; a separate save() call is redundant
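Note: the per-project/per-month loop above can be collapsed into a single two-level groupby. A minimal sketch, assuming a DataFrame with the same columns and the same timestamp_to_datestr helper; the function name pandas_group_sketch is illustrative, not part of the original code:

def pandas_group_sketch(df):
    # Sketch only: same monthly core-hour aggregation as pandas_group(),
    # assuming df already has numeric start_time/end_time/cpu/gpu/memory
    # columns and that timestamp_to_datestr(ts, project) is in scope.
    df = df.copy()
    df["month"] = df.apply(
        lambda r: timestamp_to_datestr(r["start_time"], r["project"]), axis=1)
    agg = df.groupby(["project", "month"], as_index=False)[
        ["start_time", "end_time", "cpu", "gpu", "memory"]].sum()
    agg["time"] = ((agg["end_time"] - agg["start_time"]) / 3600).clip(lower=0)
    agg["cpu"] = agg["cpu"] * agg["time"]
    agg["gpu"] = agg["gpu"] * agg["time"]
    return agg[["project", "month", "time", "cpu", "gpu", "memory"]]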
Example #2
def hbase_connect_demo():
    with krbContext(using_keytab=True,
                    principal='*****@*****.**',
                    keytab_file='/houleilei.client.keytab'):

        connection = KerberosConnection('hbase02.stor',
                                        protocol='compact',
                                        use_kerberos=True)
        test_table = connection.table('houleilei:test')
        # insert
        test_table.put('row_key_1', {'f1:q1': 'v1'})
        # get data
        print(test_table.row('row_key_1'))
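A slightly extended sketch of the same demo, adding a prefix scan and a cleanup delete. It assumes the same krbContext/KerberosConnection setup as above and uses only calls that appear elsewhere on this page (put, scan, delete, close); the function name hbase_scan_demo is illustrative:

def hbase_scan_demo():
    # Sketch only: assumes a valid Kerberos ticket (same krbContext block
    # and connection arguments as hbase_connect_demo() above).
    connection = KerberosConnection('hbase02.stor',
                                    protocol='compact',
                                    use_kerberos=True)
    test_table = connection.table('houleilei:test')
    test_table.put('row_key_2', {'f1:q1': 'v2'})
    # iterate over all rows whose key starts with 'row_key'
    for row_key, row_data in test_table.scan(row_prefix='row_key'):
        print(row_key, row_data)
    # remove the demo rows and release the connection
    test_table.delete('row_key_1')
    test_table.delete('row_key_2')
    connection.close()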
Example #3
    def batch_send_(self, tab_name, li):
        connection = KerberosConnection(
            HBASE_HOST,
            protocol='compact',
            use_kerberos=True,
            table_prefix=HBASE_TABLE_PREFIX,
            table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
            sasl_service_name='hbase')
        tab = connection.table(tab_name)
        batch = tab.batch()
        for r_k, i_d in li:
            batch.put(r_k, i_d)
        batch.send()
        connection.close()
Example #4
def get_hbase_table(table_name, server=None, timeout=None):
    hbase_server = server or settings.PROMOTRON_HBASE_SERVER
    hbase_timeout = timeout or settings.PROMOTRON_HBASE_TIMEOUT

    try:
        connection = KerberosConnection(
            hbase_server,
            protocol='compact',
            use_kerberos=True,
            timeout=hbase_timeout,
        )
        return connection.table(table_name)
    except Exception as e:
        logger.error("Error getting table from hbase: " + str(e))
        raise Exception(str(e)) from e
Example #5
    def get_table(self):
        try:
            connection = KerberosConnection(
                self.server,
                timeout=self.timeout,
                use_kerberos=True,
                protocol="compact",
            )
        except Exception:
            # the first attempt occasionally fails; simply retry the connection once
            connection = KerberosConnection(
                self.server,
                timeout=self.timeout,
                use_kerberos=True,
                protocol="compact",
            )

        return connection.table(self.table_name)
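The try/except above simply retries the connection once. The same intent written as a small loop (a sketch only; get_table_with_retries and the attempts parameter are illustrative, not part of the original class):

    def get_table_with_retries(self, attempts=2):
        # Sketch only: equivalent to get_table() above, but the single
        # retry is generalised to a configurable number of attempts.
        last_error = None
        for _ in range(attempts):
            try:
                connection = KerberosConnection(
                    self.server,
                    timeout=self.timeout,
                    use_kerberos=True,
                    protocol="compact",
                )
                return connection.table(self.table_name)
            except Exception as exc:
                last_error = exc
        raise last_error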
Example #6
def read_csv_to_hbase():
    """
    read_csv_to_hbase
    :return:
    """
    columns = [
        "id", "name", "category", "project", "cluster", "creator", "datasets",
        "result", "status", "start_time", "end_time", "image", "git_url",
        "git_branch", "git_commit", "command", "cpu", "gpu", "spot", "memory",
        "gpu_model", "relation_report"
    ]
    df = pd.read_csv("/home/holyzing/Desktop/batchjob_info.csv",
                     delimiter=",",
                     header=None)
    df.columns = columns
    print(len(df.index))
    with krbContext(using_keytab=True, principal=principal,
                    keytab_file=keytab):
        conn = KerberosConnection('hbase02.stor',
                                  protocol='compact',
                                  use_kerberos=True)
        table = conn.table(cm_table)
        # TODO: use table.batch() / batch.put() for bulk inserts instead of
        # one put() per row
        for index, row in df.iterrows():
            data = {f"infos:{column}": str(row[column]) for column in columns}
            data["infos:cpu"] = re.search(r"\d+", data["infos:cpu"]).group()
            data["infos:gpu"] = re.search(r"\d+", data["infos:gpu"]).group()
            data["infos:memory"] = re.search(r"\d+",
                                             data["infos:memory"]).group()
            data["infos:spot"] = re.search(r"true|false",
                                           data["infos:spot"]).group()
            data["infos:gpu_model"] = data["infos:gpu_model"].split(
                ": ")[1][1:-3]
            table.put(str(index), data)
        conn.close()
Example #7
def get_connection(server):
    connection = KerberosConnection(server,
                                    protocol='compact',
                                    use_kerberos=True,
                                    timeout=300000)
    return connection
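A short usage sketch for the helper above; the server and table names are placeholders borrowed from the other examples, not part of the original helper:

# Sketch only: example use of get_connection(); 'hbase02.stor' and
# 'houleilei:test' are placeholder names.
connection = get_connection('hbase02.stor')
table = connection.table('houleilei:test')
for row_key, row_data in table.scan(row_prefix='row_key'):
    print(row_key, row_data)
connection.close()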
Example #8
def execute_process(options):

    spark = pyspark.sql.session.SparkSession \
            .builder \
            .appName("criar_tabela_pip_investigados") \
            .enableHiveSupport() \
            .getOrCreate()
    # To avoid memory problems caused by broadcast joins
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

    schema_exadata = options['schema_exadata']
    schema_exadata_aux = options['schema_exadata_aux']
    schema_hbase = options['schema_hbase']

    assunto_delito = spark.sql("""
        SELECT H.id, CASE WHEN P.nome_delito IS NOT NULL THEN P.nome_delito ELSE H.hierarquia END AS assunto
        FROM {0}.mmps_assunto_docto H
        LEFT JOIN {0}.tb_penas_assuntos P ON P.id = H.id
    """.format(schema_exadata_aux))
    assunto_delito.createOrReplaceTempView('assunto_delito')

    assuntos = spark.sql("""
        SELECT asdo_docu_dk, concat_ws(' --- ', collect_list(assunto)) as assuntos
        FROM {0}.mcpr_assunto_documento
        JOIN assunto_delito ON id = asdo_assu_dk
        WHERE asdo_dt_fim IS NULL
        GROUP BY asdo_docu_dk
    """.format(schema_exadata))
    assuntos.createOrReplaceTempView('assuntos')

    representantes_investigados = spark.sql("""
        SELECT DISTINCT representante_dk
        FROM {1}.tb_pip_investigados_representantes
        JOIN {0}.mcpr_personagem ON pess_dk = pers_pess_dk
        WHERE pers_tppe_dk IN (290, 7, 21, 317, 20, 14, 32, 345, 40, 5, 24)
    """.format(schema_exadata, schema_exadata_aux))
    representantes_investigados.createOrReplaceTempView(
        'representantes_investigados')

    documentos_pips = spark.sql("""
        SELECT 
            R.representante_dk,
            R.pess_dk,
            concat_ws(', ', collect_list(tppe_descricao)) as tppe_descricao,
            concat_ws(', ', collect_list(cast(tppe_dk as int))) as tppe_dk,
            pers_docu_dk as docu_dk
            --MIN(CASE WHEN pers_dt_fim <= current_timestamp() THEN 'Data Fim Atingida' ELSE 'Ativo' END) AS status_personagem,
            --concat_ws(', ', collect_list(pers_dk)) as pers_dk
        FROM {0}.mcpr_personagem
        JOIN {0}.mcpr_tp_personagem ON tppe_dk = pers_tppe_dk
        JOIN {1}.tb_pip_investigados_representantes R ON pers_pess_dk = R.pess_dk
        JOIN representantes_investigados RI ON RI.representante_dk = R.representante_dk
        GROUP BY R.representante_dk, R.pess_dk, pers_docu_dk
    """.format(schema_exadata, schema_exadata_aux))
    documentos_pips.createOrReplaceTempView('documentos_pips')
    spark.catalog.cacheTable('documentos_pips')

    docs_representantes = spark.sql("""
        SELECT DISTINCT docu_dk, representante_dk
        FROM documentos_pips
    """)
    docs_representantes.createOrReplaceTempView('docs_representantes')
    spark.catalog.cacheTable('docs_representantes')

    documentos_investigados = spark.sql("""
        WITH tb_coautores AS (
            SELECT A.docu_dk, A.representante_dk,
                concat_ws(', ', collect_list(C.pess_nm_pessoa)) as coautores
            FROM docs_representantes A
            JOIN docs_representantes B ON A.docu_dk = B.docu_dk AND A.representante_dk != B.representante_dk
            JOIN {0}.mcpr_pessoa C ON C.pess_dk = B.representante_dk
            GROUP BY A.docu_dk, A.representante_dk
        ),
        ultimos_andamentos AS (
            SELECT docu_dk, pcao_dt_andamento, tppr_descricao, row_number() over (partition by docu_dk order by pcao_dt_andamento desc) as nr_and
            FROM (SELECT DISTINCT docu_dk FROM documentos_pips) p
            JOIN {0}.mcpr_vista ON vist_docu_dk = docu_dk
            JOIN {0}.mcpr_andamento ON pcao_vist_dk = vist_dk
            JOIN {0}.mcpr_sub_andamento ON stao_pcao_dk = pcao_dk
            JOIN {0}.mcpr_tp_andamento ON stao_tppr_dk = tppr_dk
        )
        SELECT 
            D.representante_dk,
            coautores,
            tppe_descricao, 
            tppe_dk,
            docu_orgi_orga_dk_responsavel as pip_codigo,
            D.docu_dk,
            DOCS.docu_nr_mp,
            DOCS.docu_nr_externo,
            DOCS.docu_dt_cadastro,
            --cldc_ds_classe,
            O.orgi_nm_orgao,
            --DOCS.docu_tx_etiqueta,
            assuntos,
            fsdc_ds_fase,
            pcao_dt_andamento as dt_ultimo_andamento,
            tppr_descricao as desc_ultimo_andamento,
            D.pess_dk,
            --status_personagem,
            --pers_dk,
            cod_pct,
            cast(substring(cast(D.representante_dk as string), -1, 1) as int) as rep_last_digit
        FROM documentos_pips D
        JOIN {0}.mcpr_documento DOCS ON DOCS.docu_dk = D.docu_dk
        LEFT JOIN tb_coautores CA ON CA.docu_dk = D.docu_dk AND CA.representante_dk = D.representante_dk
        LEFT JOIN (SELECT * FROM ultimos_andamentos WHERE nr_and = 1) UA ON UA.docu_dk = D.docu_dk
        JOIN {0}.orgi_orgao O ON orgi_dk = docu_orgi_orga_dk_responsavel
        --LEFT JOIN {0}.mcpr_classe_docto_mp ON cldc_dk = docu_cldc_dk
        LEFT JOIN assuntos TASSU ON asdo_docu_dk = D.docu_dk
        LEFT JOIN {0}.mcpr_fases_documento ON docu_fsdc_dk = fsdc_dk
        LEFT JOIN {1}.atualizacao_pj_pacote ON id_orgao = docu_orgi_orga_dk_responsavel
        WHERE docu_tpst_dk != 11
    """.format(schema_exadata, schema_exadata_aux))

    table_name_procedimentos = options['table_name_procedimentos']
    table_name = "{}.{}".format(schema_exadata_aux, table_name_procedimentos)
    documentos_investigados.repartition("rep_last_digit").write.mode(
        "overwrite").saveAsTable("temp_table_pip_investigados_procedimentos")
    spark.catalog.clearCache()
    temp_table = spark.table("temp_table_pip_investigados_procedimentos")
    temp_table.repartition(15).write.mode("overwrite").partitionBy(
        'rep_last_digit').saveAsTable(table_name)
    spark.sql("drop table temp_table_pip_investigados_procedimentos")

    execute_compute_stats(table_name)

    # The count will only be used by and for PIPs, at least for now
    table = spark.sql("""
        WITH DOCS_INVESTIGADOS_FILTERED AS (
            SELECT representante_dk, pess_dk, docu_dk, docu_dt_cadastro, pip_codigo, fsdc_ds_fase
            FROM {1}.tb_pip_investigados_procedimentos
            WHERE tppe_dk REGEXP '(^| )(290|7|21|317|20|14|32|345|40|5|24)(,|$)' -- only the investigated parties enter the count
            AND cod_pct IN (200, 201, 202, 203, 204, 205, 206, 207, 208, 209)
        ),
        DISTINCT_DOCS_COUNT AS (
            SELECT representante_dk, COUNT(DISTINCT docu_dk) as nr_investigacoes, MAX(docu_dt_cadastro) as max_docu_date
            FROM DOCS_INVESTIGADOS_FILTERED
            JOIN {0}.mcpr_pessoa P ON P.pess_dk = representante_dk
            WHERE P.pess_nm_pessoa NOT REGEXP 'IDENTIFICADO|IGNORAD[OA]|APURA[CÇ][AÃ]O'
            GROUP BY representante_dk
        )
        SELECT 
            pess_nm_pessoa,
            t.*,
            MULTI.flag_multipromotoria,
            TOPN.flag_top50,
            cast(substring(cast(t.pip_codigo as string), -1, 1) as int) as orgao_last_digit
        FROM (
            SELECT c.representante_dk, pip_codigo, nr_investigacoes
            FROM DISTINCT_DOCS_COUNT c
            JOIN (
                SELECT representante_dk, pip_codigo
                FROM DOCS_INVESTIGADOS_FILTERED
                GROUP BY representante_dk, pip_codigo
                HAVING SUM(CASE WHEN fsdc_ds_fase = "Em Andamento" THEN 1 ELSE 0 END) > 0
            ) r
                ON c.representante_dk = r.representante_dk) t
        LEFT JOIN (
            SELECT representante_dk, True as flag_multipromotoria
            FROM DOCS_INVESTIGADOS_FILTERED
            GROUP BY representante_dk
            HAVING COUNT(DISTINCT pip_codigo) > 1
        ) MULTI ON MULTI.representante_dk = t.representante_dk
        LEFT JOIN (
            SELECT representante_dk, True as flag_top50
            FROM DISTINCT_DOCS_COUNT
            ORDER BY nr_investigacoes DESC, max_docu_date DESC
            LIMIT 50
        ) TOPN ON TOPN.representante_dk = t.representante_dk
        JOIN {0}.mcpr_pessoa ON pess_dk = t.representante_dk
    """.format(schema_exadata, schema_exadata_aux))

    table_name_investigados = options['table_name_investigados']
    table_name = "{}.{}".format(schema_exadata_aux, table_name_investigados)
    table.repartition('orgao_last_digit').write.mode("overwrite").saveAsTable(
        "temp_table_pip_investigados")
    temp_table = spark.table("temp_table_pip_investigados")
    temp_table.repartition(10).write.mode("overwrite").partitionBy(
        'orgao_last_digit').saveAsTable(table_name)
    spark.sql("drop table temp_table_pip_investigados")

    execute_compute_stats(table_name)

    # Investigated parties that appear in new documents have their flags reset in HBase
    table_name_dt_checked = options['table_name_dt_checked']
    is_exists_dt_checked = check_table_exists(spark, schema_exadata_aux,
                                              table_name_dt_checked)
    current_time = datetime.now()

    if not is_exists_dt_checked:
        new_names = spark.sql("""
            SELECT pip_codigo, collect_list(representante_dk) as representantes
            FROM {0}.tb_pip_investigados_procedimentos
            WHERE docu_dt_cadastro > '{1}'
            GROUP BY pip_codigo
        """.format(schema_exadata_aux,
                   str(current_time - timedelta(minutes=20)))).collect()
    else:
        new_names = spark.sql("""
            SELECT pip_codigo, collect_list(representante_dk) as representantes
            FROM {0}.tb_pip_investigados_procedimentos
            JOIN {0}.dt_checked_investigados
            WHERE docu_dt_cadastro > dt_ultima_verificacao
            GROUP BY pip_codigo
        """.format(schema_exadata_aux)).collect()

    conn = KerberosConnection(
        'bda1node05.pgj.rj.gov.br',
        timeout=3000,
        use_kerberos=True,
        protocol="compact",
    )
    #conn = Connection('bda1node05.pgj.rj.gov.br')
    t = conn.table('{}:pip_investigados_flags'.format(schema_hbase))
    for orgao in new_names:
        orgao_id = str(orgao['pip_codigo'])
        representantes = orgao['representantes']
        removed_rows = t.scan(
            row_prefix=orgao_id,
            filter=("DependentColumnFilter ('flags', 'is_removed')"))
        for row in removed_rows:
            if row[1].get("identificacao:representante_dk") and row[1][
                    "identificacao:representante_dk"].decode(
                        'utf-8') in representantes:
                t.delete(row[0])

    # Use a table to store the date of the last check for new documents
    tb_ultima_verificacao = spark.sql("""
        SELECT '{0}' as dt_ultima_verificacao
    """.format(str(current_time)))

    table_name = "{}.{}".format(schema_exadata_aux, table_name_dt_checked)
    tb_ultima_verificacao.write.mode("overwrite").saveAsTable(table_name)

    execute_compute_stats(table_name)
Example #9
    def batch_send(self, li, tab_name, row_key, item_data, batch_size):
        li.append((row_key, item_data))
        #print(len(li))
        # flush once enough rows have been buffered
        if len(li) >= batch_size:
            try:
                connection = KerberosConnection(
                    HBASE_HOST,
                    protocol='compact',
                    use_kerberos=True,
                    table_prefix=HBASE_TABLE_PREFIX,
                    table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
                    sasl_service_name='hbase')
                tab = connection.table(tab_name)
                batch = tab.batch()
                for r_k, i_d in li:
                    batch.put(r_k, i_d)
                batch.send()
                connection.close()
            except Exception as e:
                # assume the failure came from a missing table:
                # create it, then resend the whole batch
                connection = KerberosConnection(
                    HBASE_HOST,
                    protocol='compact',
                    use_kerberos=True,
                    table_prefix=HBASE_TABLE_PREFIX,
                    table_prefix_separator=HBASE_TABLE_PREFIX_SEPARATOR,
                    sasl_service_name='hbase')
                schema = {'info': dict(max_versions=3)}
                connection.create_table(tab_name, schema)
                tab = connection.table(tab_name)
                batch = tab.batch()
                for r_k, i_d in li:
                    batch.put(r_k, i_d)
                batch.send()
                connection.close()
            finally:
                # clear the buffer whether or not the send succeeded
                del li[:]