Exemple #1
0
def levenshtein_matrix(df, input_col):
    """
    Create a couple of column with all the string combination
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    #  Create all the combination between the string to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    if Optimus.cache:
        df = df.cache()

    return df
def levenshtein_cluster(df, col_name):
    """
    Return a dataframe with a string of cluster related to a string
    :param df:
    :param col_name:
    :return:
    """
    # Prepare a group so we don need to apply the fingerprint to the whole data set
    df = df.select(col_name).groupby(col_name).agg(
        F.count(col_name).alias("count"))
    df = keycollision.fingerprint(df, col_name)

    df_t = df.groupby(col_name + "_FINGERPRINT").agg(
        F.collect_list(col_name).alias("cluster"),
        F.size(F.collect_list(col_name)).alias("cluster_size"),
        F.first(col_name).alias("recommended"),
        F.sum("count").alias("count")).repartition(1)

    # Filter nearest string
    df_l = levenshtein_filter(df, col_name).repartition(1)

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[col_name + "_FROM"] == df_t[col_name + "_FINGERPRINT"]), how="left") \
        .cols.drop(col_name + "_FINGERPRINT") \
        .cols.drop([col_name + "_FROM", col_name + "_TO", col_name + "_LEVENSHTEIN_DISTANCE"])

    return df_l
def levenshtein_matrix(df, col_name):
    """
    Create a couple of column with all the string combination
    :param df:
    :param col_name:
    :return:
    """
    df = keycollision.fingerprint(df, col_name)

    col_fingerprint = col_name + "_FINGERPRINT"
    col_distance = col_name + "_LEVENSHTEIN_DISTANCE"

    temp_col_1 = col_name + "_LEVENSHTEIN_1"
    temp_col_2 = col_name + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    df = df.select(col_fingerprint).distinct().select(
        F.col(col_fingerprint).alias(temp_col_1),
        F.col(col_fingerprint).alias(temp_col_2))

    #  Create all the combination between the string to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(col_distance, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    return df
Exemple #4
0
def levenshtein_cluster(df, input_col):
    """
    Return a dataframe with a string of cluster related to a string
    :param df:
    :param input_col:
    :return:
    """
    # Prepare a group so we don need to apply the fingerprint to the whole data set
    df = df.select(input_col).groupby(input_col).agg(F.count(input_col).alias("count"))
    df = keycollision.fingerprint(df, input_col)

    count_col = name_col(input_col, COUNT_COL)
    cluster_col = name_col(input_col, CLUSTER_COL)
    recommended_col = name_col(input_col, RECOMMENDED_COL)
    cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)

    df_t = df.groupby(fingerprint_col).agg(F.collect_list(input_col).alias(cluster_col),
                                           F.size(F.collect_list(input_col)).alias(cluster_size_col),
                                           F.first(input_col).alias(recommended_col),
                                           F.sum("count").alias(count_col)).repartition(1)

    # Filter nearest string
    df_l = levenshtein_filter(df, input_col).repartition(1)

    # Create Cluster
    df_l = df_l.join(df_t, (df_l[input_col + "_FROM"] == df_t[fingerprint_col]), how="left") \
        .cols.drop(fingerprint_col) \
        .cols.drop([input_col + "_FROM", input_col + "_TO", input_col + "_LEVENSHTEIN_DISTANCE"])

    return df_l
Exemple #5
0
	def test_fingerprint():
		actual_df =keyCol.fingerprint(source_df,'STATE')
		expected_df = op.create.df([('LOCNCODE', StringType(), True),('LOCNDSCR', StringType(), True),('ADDRESS1', StringType(), True),('ADDRESS2', StringType(), True),('ADDRESS3', StringType(), True),('CITY', StringType(), True),('STATE', StringType(), True),('ZIPCODE', StringType(), True),('COUNTRY', StringType(), True),('Location_Segment', StringType(), True),('PAQ', StringType(), True),('TIPUNI', StringType(), True),('Tipo_unidad', StringType(), True),('ITEMNMBR', StringType(), True),('ITMSHNAM', StringType(), True),('MZ', StringType(), True),('LT', StringType(), True),('EDIF', StringType(), True),('NIVEL', StringType(), True),('NOUNI', StringType(), True),('CONDO', StringType(), True),('REGIMEN', StringType(), True),('ETAPA', StringType(), True),('PROTO', StringType(), True),('ITEMDESC', StringType(), True),('NIVELES', StringType(), True),('COCHERA', StringType(), True),('RECAM', StringType(), True),('ALCOB', StringType(), True),('BANOS', StringType(), True),('Num_Balcon', StringType(), True),('SALA', StringType(), True),('COMEDOR', StringType(), True),('COCINA', StringType(), True),('Cuarto_Lavado', StringType(), True),('Cuarto_Servicio', StringType(), True),('OTROX', StringType(), True),('OTROX1', StringType(), True),('SupCons', StringType(), True),('PATIOSERV', StringType(), True),('TERRAZA', StringType(), True),('BALCON', StringType(), True),('AZOTEA', StringType(), True),('Otros', StringType(), True),('AREATOT', StringType(), True),('FRENTE', StringType(), True),('Sup_Terreno', StringType(), True),('EXCEDENTE', StringType(), True),('OTRO1', StringType(), True),('OTRO2', StringType(), True),('TAMANO', StringType(), True),('UBICAVER', StringType(), True),('UBICAHORI', StringType(), True),('QTYONHND_', StringType(), True),('QTYSOLD', StringType(), True),('INACTIVE', StringType(), True),('UOMPRICE', StringType(), True),('MONTOAPA', StringType(), True),('PAGINI', StringType(), True),('ENGANCHE', StringType(), True),('FECHESCRIPRO', StringType(), True),('FECHAENTREGA', StringType(), True),('FECHASALIDAVENTAS', StringType(), True),('LIBERADO_NOLIBERADO', StringType(), True),('ACTIVO_INACTIVO', StringType(), True),('Estatus1Vivienda', StringType(), True),('Estatus2Vivienda', StringType(), True),('CUSTNMBR', StringType(), True),('Nombre_Completo', StringType(), True),('cNombre', StringType(), True),('cApellidoPaterno', StringType(), True),('cApellidoMaterno', StringType(), True),('cRfc', StringType(), True),('cCurp', StringType(), True),('fkIdGradoInteres', StringType(), True),('cSexo', StringType(), True),('cEmail', StringType(), True),('cTelefonoCasa', StringType(), True),('cTelefonoCelular', StringType(), True),('cTelefonoTrabajo', StringType(), True),('cNumeroSeguroSocial', StringType(), True),('dFechaNacimiento', StringType(), True),('cEstadoCivil', StringType(), True),('cRegimenConyugal', StringType(), True),('cNacionalidad', StringType(), True),('cLugarNacimiento', StringType(), True),('cRecomendadoPor', StringType(), True),('fkIdMedio', StringType(), True),('cMedioContacto', StringType(), True),('cCalle', StringType(), True),('cNumeroExterior', StringType(), True),('cNumeroInterior', StringType(), True),('cColonia', StringType(), True),('cMunicipio', StringType(), True),('cEstado', StringType(), True),('cPais', StringType(), True),('cCodigoPostal', StringType(), True),('nTiempoResidencia', StringType(), True),('cComentario', StringType(), True),('cNumeroIdentificacion', StringType(), True),('cTipoIdentificación', StringType(), True),('REFERENCIA', StringType(), True),('FACTURA', StringType(), True),('NOTACR', StringType(), True),('Precio_cierre', StringType(), True),('Precio_cierre_Tot', StringType(), True),('Aumento_al_Contrato', StringType(), True),('Condonacón', StringType(), True),('Precio_Escritura_Total', StringType(), True),('Precio_Dev', StringType(), True),('Precio_Dev_Total', StringType(), True),('Notarios_Proyectados', StringType(), True),('Gatos_A_terceros', StringType(), True),('Depositos', StringType(), True),('Saldo', StringType(), True),('dFechaCreacion', StringType(), True),('dFechaModificacion', StringType(), True),('FECHA_Cotizado', StringType(), True),('FECHA_SolApartado', StringType(), True),('FECHA_AutApartado', StringType(), True),('Vigencia_Apartado', StringType(), True),('FechaVencimientoApartado', StringType(), True),('FECHA_SolDictamen', StringType(), True),('FECHA_ProcDictamen', StringType(), True),('FECHA_DictaminadoLlamada', StringType(), True),('FECHA_DictaminadoFirma', StringType(), True),('FECHA_Dictaminado', StringType(), True),('FECHA_Rechazado', StringType(), True),('FECHA_EscrituraAvaluo', StringType(), True),('FECHA_EscrituraFolio', StringType(), True),('FolioEscsritura', StringType(), True),('FECHA_EscrituraReal', StringType(), True),('FECHA_Cancelado', StringType(), True),('FECHA_Liberado', StringType(), True),('FECHA_Entregado', StringType(), True),('MotivoCancelacion', StringType(), True),('STATE***FINGERPRINT', StringType(), True)], [('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV008', 'ALVCDEY0080', None, None, None, None, '008', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV021', 'ALVCDEY0690', None, None, None, None, '069', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV022', 'ALVCDEY0710', None, None, None, None, '071', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV027', 'ALVCDEY0810', None, None, None, None, '081', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV032', 'ALVCEEY0090', None, None, None, None, '009', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV035', 'ALVCEEY0150', None, None, None, None, '015', None, '0', '0', 'EST CEEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV009', 'ALVCDEY0100', None, None, None, None, '010', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV012', 'ALVCDEY0160', None, None, None, None, '016', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV019', 'ALVCDEY0650', None, None, None, None, '065', None, '0', '0', 'EST CDEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal'), ('ALV', 'Altos Lindavista', 'Guanajuato # 85', None, 'San Bartolo Atepehuacan', 'Gustavo A. Madero', 'Distrito Federal', '07730', 'Mexico', '0531', None, '2', 'ESTACIONAMIENTO', 'ALVV044', 'ALVCUEY0340', None, None, None, None, '034', None, '0', '0', 'EST CUEY', 'Cajon virtual', None, None, None, None, None, None, None, None, None, None, None, None, None, '2.2', None, None, None, None, None, None, '2.4', None, '0', None, None, 'Chico', 'Cajon virtual', 'Cajon virtual', '0', '0', '1', '0', None, None, None, None, None, None, 'NO LIBERADO', 'INACTIVO', 'DISPONIBLE', '000-DISPONIBLE', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', '.00000', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'distritofederal')])
		assert (expected_df.collect() == actual_df.collect())
Exemple #6
0
def levenshtein_json(df, input_col):
    """
    Output the levenshtein distance in json format
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)
    # df.table()
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns to calculate the cross join
    result = df.select(input_col,
                       F.col(fingerprint_col).alias(temp_col_1)).distinct()

    df = df.select(input_col,
                   F.col(fingerprint_col).alias(temp_col_1),
                   F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combination between the string to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # if Optimus.cache:
    #     df = df.cache()

    # Select only the string with shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    df_r = (df.rows.drop(F.col(distance_col) == 0).groupby(temp_col_1).agg(
        F.min(distance_col).alias(distance_r_col)).cols.rename(
            temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists

    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2)))

    kv_dict = {}
    for row in result.collect():
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = _row[0]

    kv_result_df = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        kv_result_df[_row[0]] = _row[1]

    result = {}
    for k, v in kv_result_df.items():
        a = result[kv_dict[k]] = []
        for iv in v:
            a.append(kv_dict[iv])

    return result
def levenshtein_cluster(df,
                        input_col,
                        threshold: int = None,
                        output: str = "dict"):
    """
    Output the levenshtein distance in json format
    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: number
    :param output: "dict" or "json"
    :return:
    """
    # Create fingerprint
    df_fingerprint = keycollision.fingerprint(df, input_col)

    # Names
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    count = "count"

    # Prepare the columns to calculate the cross join
    fingerprint_count = df_fingerprint.select(input_col, fingerprint_col).groupby(input_col) \
        .agg(F.first(input_col).alias(temp_col_1), F.first(fingerprint_col).alias(temp_col_2),
             F.count(input_col).alias(count)) \
        .select(temp_col_1, temp_col_2, count).collect()

    df = df_fingerprint.select(
        input_col,
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combination between the string to calculate the levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Select only the string with shortest path
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    if threshold is None:
        where = ((F.col(distance_col) == 0) &
                 (F.col(temp_col_1) != F.col(temp_col_2)))
    else:
        where = (F.col(distance_col) == 0) | (F.col(distance_col) > threshold)

    df_r = (
        df.rows.drop(where).cols.replace(
            distance_col, 0, None,
            search_by="numeric").groupby(temp_col_1).agg(
                F.min(distance_col).alias(distance_r_col))
        # .cols.rename(distance_col, distance_r_col)
        .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2),
                                     F.count(temp_col_2)))

    # Replace ngram per string
    kv_dict = {}
    for row in fingerprint_count:
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = {_row[0]: _row[2]}

    result = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        d = {}
        for i in _row[1]:
            key = list(kv_dict[i].keys())[0]
            value = list(kv_dict[i].values())[0]
            d[key] = value
        key = list(kv_dict[_row[0]].keys())[0]
        value = list(kv_dict[_row[0]].values())[0]
        d.update({key: value})
        result[key] = d

    # Calculate count and sum
    f = {}
    for k, v in result.items():
        _sum = 0
        for x, y in v.items():
            _sum = _sum + y
        f[k] = {"similar": v, "count": len(v), "sum": _sum}

    result = f
    if output == "json":
        result = dump_json(result)
    return result