Python fingerprint Examples

Programming Language: Python

Namespace/Package Name: optimus.ml.keycollision

Method/Function: fingerprint

Examples at hotexamples.com: 7

Python fingerprint - 7 examples found. These are the top-rated real-world Python examples of optimus.ml.keycollision.fingerprint, extracted from open source projects. You can rate examples to help us improve their quality.

Example #1

Show file

def levenshtein_matrix(df, input_col):
    """
    Pair every distinct fingerprint of ``input_col`` with every other one and
    attach the Levenshtein distance of each pair.

    :param df: Spark Dataframe
    :param input_col: Name of the column that is fingerprinted
    :return: Dataframe holding the ``<input_col>_LEVENSHTEIN_1`` and
        ``<input_col>_LEVENSHTEIN_2`` columns plus the distance column;
        it is cached when ``Optimus.cache`` is truthy
    """
    left_col = input_col + "_LEVENSHTEIN_1"
    right_col = input_col + "_LEVENSHTEIN_2"

    fingerprinted = keycollision.fingerprint(df, input_col)
    fingerprint = F.col(name_col(input_col, FINGERPRINT_COL))
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)

    # The same fingerprint is exposed under two names so that both sides of
    # the cross join below can be told apart
    unique_pairs = fingerprinted.select(
        fingerprint.alias(left_col),
        fingerprint.alias(right_col),
    ).distinct()

    # Every left/right combination, scored with its edit distance
    left_side = unique_pairs.select(left_col)
    right_side = unique_pairs.select(right_col)
    distances = left_side.crossJoin(right_side).withColumn(
        distance_col, F.levenshtein(F.col(left_col), F.col(right_col)))

    return distances.cache() if Optimus.cache else distances

Example #2

Show file

File: distancecluster.py Project: zhuanglineu/Optimus

def levenshtein_cluster(df, col_name):
    """
    Group the values of ``col_name`` by fingerprint and attach each group to
    the rows produced by ``levenshtein_filter``.

    :param df: Spark Dataframe
    :param col_name: Name of the column to cluster
    :return: Dataframe resulting from the left join, without the fingerprint,
        ``_FROM``, ``_TO`` and ``_LEVENSHTEIN_DISTANCE`` helper columns
    """
    fingerprint_col = col_name + "_FINGERPRINT"
    from_col = col_name + "_FROM"
    helper_cols = [from_col, col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"]

    # Collapse to one row per distinct value first, so the fingerprint is not
    # computed over the whole data set
    counted = df.select(col_name).groupby(col_name).agg(
        F.count(col_name).alias("count"))
    counted = keycollision.fingerprint(counted, col_name)

    # One row per fingerprint: its members, how many there are, a
    # representative value and the total number of occurrences
    members = F.collect_list(col_name)
    clusters = counted.groupby(fingerprint_col).agg(
        members.alias("cluster"),
        F.size(members).alias("cluster_size"),
        F.first(col_name).alias("recommended"),
        F.sum("count").alias("count"),
    ).repartition(1)

    # Nearest strings, as selected by levenshtein_filter
    nearest = levenshtein_filter(counted, col_name).repartition(1)

    # Attach the cluster information and discard the helper columns
    joined = nearest.join(clusters, (nearest[from_col] == clusters[fingerprint_col]), how="left")
    joined = joined.cols.drop(fingerprint_col)
    return joined.cols.drop(helper_cols)

Example #3

Show file

File: distancecluster.py Project: zhuanglineu/Optimus

def levenshtein_matrix(df, col_name):
    """
    Pair every distinct fingerprint of ``col_name`` with every other one and
    attach the Levenshtein distance of each pair.

    :param df: Spark Dataframe
    :param col_name: Name of the column that is fingerprinted
    :return: Dataframe holding the ``<col_name>_LEVENSHTEIN_1`` and
        ``<col_name>_LEVENSHTEIN_2`` columns plus
        ``<col_name>_LEVENSHTEIN_DISTANCE``
    """
    fingerprint_col = col_name + "_FINGERPRINT"
    distance_col = col_name + "_LEVENSHTEIN_DISTANCE"
    left_col = col_name + "_LEVENSHTEIN_1"
    right_col = col_name + "_LEVENSHTEIN_2"

    fingerprinted = keycollision.fingerprint(df, col_name)

    # Keep each fingerprint once, then expose it under two names so both
    # sides of the cross join can be told apart
    unique_fingerprints = fingerprinted.select(fingerprint_col).distinct()
    fingerprint = F.col(fingerprint_col)
    named = unique_fingerprints.select(
        fingerprint.alias(left_col),
        fingerprint.alias(right_col),
    )

    # Every left/right combination, scored with its edit distance
    combinations = named.select(left_col).crossJoin(named.select(right_col))
    return combinations.withColumn(
        distance_col, F.levenshtein(F.col(left_col), F.col(right_col)))

Example #4

Show file

def levenshtein_cluster(df, input_col):
    """
    Group the values of ``input_col`` by fingerprint and attach each group to
    the rows produced by ``levenshtein_filter``.

    :param df: Spark Dataframe
    :param input_col: Name of the column to cluster
    :return: Dataframe resulting from the left join, without the fingerprint,
        ``_FROM``, ``_TO`` and ``_LEVENSHTEIN_DISTANCE`` helper columns
    """
    # Collapse to one row per distinct value first, so the fingerprint is not
    # computed over the whole data set
    counted = df.select(input_col).groupby(input_col).agg(
        F.count(input_col).alias("count"))
    counted = keycollision.fingerprint(counted, input_col)

    # Output column names
    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    from_col = input_col + "_FROM"
    helper_cols = [from_col, input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"]

    # One row per fingerprint: its members, how many there are, a
    # representative value and the total number of occurrences
    members = F.collect_list(input_col)
    clusters = counted.groupby(fingerprint_col).agg(
        members.alias(cluster_col),
        F.size(members).alias(cluster_size_col),
        F.first(input_col).alias(recommended_col),
        F.sum("count").alias(count_col),
    ).repartition(1)

    # Nearest strings, as selected by levenshtein_filter
    nearest = levenshtein_filter(counted, input_col).repartition(1)

    # Attach the cluster information and discard the helper columns
    joined = nearest.join(clusters, (nearest[from_col] == clusters[fingerprint_col]), how="left")
    joined = joined.cols.drop(fingerprint_col)
    return joined.cols.drop(helper_cols)

Example #5

Show file

	def test_fingerprint():
		"""
		Check that ``keyCol.fingerprint(source_df, 'STATE')`` yields the source
		rows plus a ``STATE***FINGERPRINT`` column.

		The expected data used to be one huge literal in which the
		'Gustavo A. Madero' string was split across physical lines, which is a
		syntax error. The ten expected rows only differ in ITEMNMBR, ITMSHNAM,
		NOUNI and PROTO, so they are rebuilt from a shared template instead.
		"""
		actual_df =keyCol.fingerprint(source_df,'STATE')

		# Every column is a nullable string; the last one is the column the
		# fingerprint call is expected to add.
		column_names = [
			'LOCNCODE', 'LOCNDSCR', 'ADDRESS1', 'ADDRESS2', 'ADDRESS3', 'CITY', 'STATE', 'ZIPCODE',
			'COUNTRY', 'Location_Segment', 'PAQ', 'TIPUNI', 'Tipo_unidad', 'ITEMNMBR', 'ITMSHNAM',
			'MZ', 'LT', 'EDIF', 'NIVEL', 'NOUNI', 'CONDO', 'REGIMEN', 'ETAPA', 'PROTO', 'ITEMDESC',
			'NIVELES', 'COCHERA', 'RECAM', 'ALCOB', 'BANOS', 'Num_Balcon', 'SALA', 'COMEDOR', 'COCINA',
			'Cuarto_Lavado', 'Cuarto_Servicio', 'OTROX', 'OTROX1', 'SupCons', 'PATIOSERV', 'TERRAZA',
			'BALCON', 'AZOTEA', 'Otros', 'AREATOT', 'FRENTE', 'Sup_Terreno', 'EXCEDENTE', 'OTRO1',
			'OTRO2', 'TAMANO', 'UBICAVER', 'UBICAHORI', 'QTYONHND_', 'QTYSOLD', 'INACTIVE', 'UOMPRICE',
			'MONTOAPA', 'PAGINI', 'ENGANCHE', 'FECHESCRIPRO', 'FECHAENTREGA', 'FECHASALIDAVENTAS',
			'LIBERADO_NOLIBERADO', 'ACTIVO_INACTIVO', 'Estatus1Vivienda', 'Estatus2Vivienda',
			'CUSTNMBR', 'Nombre_Completo', 'cNombre', 'cApellidoPaterno', 'cApellidoMaterno', 'cRfc',
			'cCurp', 'fkIdGradoInteres', 'cSexo', 'cEmail', 'cTelefonoCasa', 'cTelefonoCelular',
			'cTelefonoTrabajo', 'cNumeroSeguroSocial', 'dFechaNacimiento', 'cEstadoCivil',
			'cRegimenConyugal', 'cNacionalidad', 'cLugarNacimiento', 'cRecomendadoPor', 'fkIdMedio',
			'cMedioContacto', 'cCalle', 'cNumeroExterior', 'cNumeroInterior', 'cColonia', 'cMunicipio',
			'cEstado', 'cPais', 'cCodigoPostal', 'nTiempoResidencia', 'cComentario',
			'cNumeroIdentificacion', 'cTipoIdentificación', 'REFERENCIA', 'FACTURA', 'NOTACR',
			'Precio_cierre', 'Precio_cierre_Tot', 'Aumento_al_Contrato', 'Condonacón',
			'Precio_Escritura_Total', 'Precio_Dev', 'Precio_Dev_Total', 'Notarios_Proyectados',
			'Gatos_A_terceros', 'Depositos', 'Saldo', 'dFechaCreacion', 'dFechaModificacion',
			'FECHA_Cotizado', 'FECHA_SolApartado', 'FECHA_AutApartado', 'Vigencia_Apartado',
			'FechaVencimientoApartado', 'FECHA_SolDictamen', 'FECHA_ProcDictamen',
			'FECHA_DictaminadoLlamada', 'FECHA_DictaminadoFirma', 'FECHA_Dictaminado',
			'FECHA_Rechazado', 'FECHA_EscrituraAvaluo', 'FECHA_EscrituraFolio', 'FolioEscsritura',
			'FECHA_EscrituraReal', 'FECHA_Cancelado', 'FECHA_Liberado', 'FECHA_Entregado',
			'MotivoCancelacion', 'STATE***FINGERPRINT',
		]
		schema = [(name, StringType(), True) for name in column_names]

		def expected_row(item_number, short_name, unit_number, prototype):
			# Build one 137-field row; only the four arguments vary between rows.
			return (
				('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan',
					'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2',
					'ESTACIONAMIENTO', item_number, short_name)  # LOCNCODE .. ITMSHNAM
				+ (None,) * 4  # MZ .. NIVEL
				+ (unit_number, None, '0', '0', prototype, 'Cajon virtual')  # NOUNI .. ITEMDESC
				+ (None,) * 13  # NIVELES .. OTROX1
				+ ('2.2',)  # SupCons
				+ (None,) * 6  # PATIOSERV .. AREATOT
				+ ('2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual',
					'0', '0', '1', '0')  # FRENTE .. UOMPRICE
				+ (None,) * 6  # MONTOAPA .. FECHASALIDAVENTAS
				+ ('NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE')  # .. Estatus2Vivienda
				+ (None,) * 37  # CUSTNMBR .. NOTACR
				+ ('.00000',) * 11  # Precio_cierre .. Saldo
				+ (None,) * 21  # dFechaCreacion .. MotivoCancelacion
				+ ('distritofederal',)  # STATE***FINGERPRINT
			)

		# (ITEMNMBR, ITMSHNAM, NOUNI, PROTO) of each expected row, in order
		varying_fields = [
			('ALVV008', 'ALVCDEY0080', '008', 'EST CDEY'),
			('ALVV021', 'ALVCDEY0690', '069', 'EST CDEY'),
			('ALVV022', 'ALVCDEY0710', '071', 'EST CDEY'),
			('ALVV027', 'ALVCDEY0810', '081', 'EST CDEY'),
			('ALVV032', 'ALVCEEY0090', '009', 'EST CEEY'),
			('ALVV035', 'ALVCEEY0150', '015', 'EST CEEY'),
			('ALVV009', 'ALVCDEY0100', '010', 'EST CDEY'),
			('ALVV012', 'ALVCDEY0160', '016', 'EST CDEY'),
			('ALVV019', 'ALVCDEY0650', '065', 'EST CDEY'),
			('ALVV044', 'ALVCUEY0340', '034', 'EST CUEY'),
		]
		expected_df = op.create.df(schema, [expected_row(*fields) for fields in varying_fields])
		assert (expected_df.collect() == actual_df.collect())

Example #6

Show file

def levenshtein_json(df, input_col):
    """
    Map each string of ``input_col`` to the strings whose fingerprints are at
    the smallest non-zero Levenshtein distance from its own fingerprint.

    :param df: Spark Dataframe
    :param input_col: Name of the column to be processed
    :return: dict of ``{string: [closest strings]}``
    """
    left_col = input_col + "_LEVENSHTEIN_1"
    right_col = input_col + "_LEVENSHTEIN_2"
    min_distance_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    join_key = "TEMP_R"

    fingerprinted = keycollision.fingerprint(df, input_col)
    fingerprint = F.col(name_col(input_col, FINGERPRINT_COL))
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)

    # (original string, fingerprint) pairs, used at the end to translate
    # fingerprints back into the original strings
    originals = fingerprinted.select(
        input_col, fingerprint.alias(left_col)).distinct()

    # The fingerprint under two names, so both sides of the cross join can
    # be told apart
    candidates = fingerprinted.select(
        input_col,
        fingerprint.alias(left_col),
        fingerprint.alias(right_col),
    ).distinct()

    # Every left/right combination, scored with its edit distance
    pairs = candidates.select(left_col).crossJoin(candidates.select(right_col))
    pairs = pairs.withColumn(
        distance_col, F.levenshtein(F.col(left_col), F.col(right_col)))

    # Smallest non-zero distance per left fingerprint
    nearest = pairs.rows.drop(F.col(distance_col) == 0)
    nearest = nearest.groupby(left_col).agg(
        F.min(distance_col).alias(min_distance_col))
    nearest = nearest.cols.rename(left_col, join_key).repartition(1)

    # Keep only the pairs that sit exactly at that minimum distance
    same_key = nearest[join_key] == pairs[left_col]
    same_distance = nearest[min_distance_col] == pairs[distance_col]
    closest = pairs.join(nearest, (same_key & same_distance)) \
        .select(left_col, distance_col, right_col).repartition(1)

    # One list of neighbouring fingerprints per left fingerprint
    clusters = closest.groupby(left_col).agg(F.collect_list(right_col))

    original_by_fingerprint = {}
    for row in originals.collect():
        original, row_fingerprint = row.asDict().values()
        original_by_fingerprint[row_fingerprint] = original

    neighbours_by_fingerprint = {}
    for row in clusters.collect():
        row_fingerprint, neighbours = row.asDict().values()
        neighbours_by_fingerprint[row_fingerprint] = neighbours

    # Translate every fingerprint back to the string it came from
    return {
        original_by_fingerprint[row_fingerprint]: [
            original_by_fingerprint[neighbour] for neighbour in neighbours
        ]
        for row_fingerprint, neighbours in neighbours_by_fingerprint.items()
    }

Example #7

Show file

File: distancecluster.py Project: zhangyeejia/Optimus

def levenshtein_cluster(df,
                        input_col,
                        threshold: int = None,
                        output: str = "dict"):
    """
    Cluster the strings of ``input_col`` by the Levenshtein distance between
    their fingerprints.

    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: When given, pairs whose distance is 0 or greater than
        this number are discarded before the closest pairs are selected
    :param output: "dict" or "json"
    :return: ``{string: {"similar": {string: count}, "count": number of
        similar strings, "sum": total of their counts}}``, passed through
        ``dump_json`` when ``output`` is "json"
    """
    left_col = input_col + "_LEVENSHTEIN_1"
    right_col = input_col + "_LEVENSHTEIN_2"
    min_distance_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    join_key = "TEMP_R"
    count_col = "count"

    fingerprinted = keycollision.fingerprint(df, input_col)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)

    # (string, fingerprint, occurrences) for every distinct string
    per_string = fingerprinted.select(input_col, fingerprint_col).groupby(input_col).agg(
        F.first(input_col).alias(left_col),
        F.first(fingerprint_col).alias(right_col),
        F.count(input_col).alias(count_col))
    string_rows = per_string.select(left_col, right_col, count_col).collect()

    # The fingerprint under two names, so both sides of the cross join can
    # be told apart
    fingerprint = F.col(fingerprint_col)
    candidates = fingerprinted.select(
        input_col,
        fingerprint.alias(left_col),
        fingerprint.alias(right_col),
    ).distinct()

    # Every left/right combination, scored with its edit distance
    pairs = candidates.select(left_col).crossJoin(candidates.select(right_col))
    pairs = pairs.withColumn(
        distance_col, F.levenshtein(F.col(left_col), F.col(right_col)))

    # Rows to discard before looking for the closest pairs
    distance = F.col(distance_col)
    if threshold is not None:
        rejected = (distance == 0) | (distance > threshold)
    else:
        rejected = ((distance == 0) &
                    (F.col(left_col) != F.col(right_col)))

    # Smallest remaining distance per left fingerprint; zeros are replaced by
    # None before the minimum is taken
    nearest = pairs.rows.drop(rejected)
    nearest = nearest.cols.replace(distance_col, 0, None, search_by="numeric")
    nearest = nearest.groupby(left_col).agg(
        F.min(distance_col).alias(min_distance_col))
    nearest = nearest.cols.rename(left_col, join_key).repartition(1)

    # Keep only the pairs that sit exactly at that minimum distance
    same_key = nearest[join_key] == pairs[left_col]
    same_distance = nearest[min_distance_col] == pairs[distance_col]
    closest = pairs.join(nearest, (same_key & same_distance)) \
        .select(left_col, distance_col, right_col).repartition(1)

    # One list of neighbouring fingerprints per left fingerprint
    grouped = closest.groupby(left_col).agg(F.collect_list(right_col),
                                            F.count(right_col))

    # fingerprint -> (original string, occurrences)
    string_by_fingerprint = {}
    for row in string_rows:
        string, row_fingerprint, occurrences = row.asDict().values()
        string_by_fingerprint[row_fingerprint] = (string, occurrences)

    # Translate fingerprints back to strings; each string is also listed
    # among its own similar strings
    clusters = {}
    for row in grouped.collect():
        row_fingerprint, neighbours, _ = row.asDict().values()
        similar = dict(string_by_fingerprint[neighbour] for neighbour in neighbours)
        string, occurrences = string_by_fingerprint[row_fingerprint]
        similar[string] = occurrences
        clusters[string] = similar

    # Add the size of every cluster and the total of its counts
    result = {
        string: {"similar": similar, "count": len(similar), "sum": sum(similar.values())}
        for string, similar in clusters.items()
    }

    if output == "json":
        result = dump_json(result)
    return result