Example #1
0
def main(visitas_com_conversao,
         saida,
         data_inicial,
         data_final,
         departamentos,
         scaler=None):
    """Prepare the visits-with-conversion dataframe, optionally scale it, and
    save it partitioned by ['data', 'hora'].

    Args:
        visitas_com_conversao: source of visits-with-conversion data, passed
            straight to prepare_dataframe.
        saida: base output directory; a scaler-specific subfolder is appended.
        data_inicial / data_final: date range forwarded to prepare_dataframe.
        departamentos: comma-separated department names.
        scaler: a key of LIST_SCALER to scale the values, or None to only
            run prepare_features without scaling. Any other value is an error
            and nothing is saved.
    """
    departamentos_lista = [
        departamento.strip() for departamento in departamentos.split(",")
    ]
    print(departamentos_lista)
    result = prepare_dataframe(departamentos_lista, visitas_com_conversao,
                               data_inicial, data_final)

    # Scale the values. Membership test on the dict directly — no .keys().
    if scaler in LIST_SCALER:
        result = transform(result, LIST_SCALER[scaler])
        saida = os.path.join(saida, scaler)
        print(f"Scaling with {scaler}")
    elif scaler is None:
        result = prepare_features(result)
        saida = os.path.join(saida, "sem_normalizar")
        print("Preparing data without scaling")
    else:
        # Unknown scaler key: report and save nothing (same as the original).
        print("Error: Input correct key or None for original data")
        return

    # Save once here instead of duplicating the save in both branches above.
    save_partitioned(result, saida, ['data', 'hora'])
    print('saved ok')
Example #2
0
def main(dataset: str, saida: str, data_inicial, data_final, samples: int):
    """Cluster the dataset's feature vectors with OPTICS and save the
    labelled rows twice: partitioned by cluster first, then by date first."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    print(dataset)
    df = read_partitioned_json(file_path=dataset, filter_function=date_filter)
    print(df)

    feature_matrix = np.asarray(list(df['features'].to_numpy()))
    labels = optics(feature_matrix, samples)

    distinct_labels = set(labels)
    print(distinct_labels)
    print(len(distinct_labels))
    print("saving clusters...")

    df['cluster_label'] = list(labels)

    print("Saving partitioned by cluster first..")
    save_partitioned(df, os.path.join(saida, "by_cluster"),
                     ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(df, os.path.join(saida, "by_data"),
                     ['data', 'hora', 'cluster_label'])
Example #3
0
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final):
    """Run k-means over the dataset's feature vectors and save the frame,
    with cluster coordinates and labels attached, partitioned by data/hora."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    frame = read_partitioned_json(file_path=dataset, filter_function=date_filter)

    features = np.asarray(list(frame['features'].to_numpy()))
    coordinates, labels = kmeans(features, number_of_cluster)

    frame['cluster_coordinate'] = list(coordinates)
    frame['cluster_label'] = list(labels)

    save_partitioned(frame, saida, ['data', 'hora'])
Example #4
0
def main(dataframe_path: str, saida: str, data_inicial, data_final, type):
    """Aggregate the mean conversion rate per cluster — hourly when ``type``
    is "hora", otherwise daily — and save it partitioned by the same keys.

    NOTE: the parameter name ``type`` shadows the builtin, but it is part of
    the public interface and is kept unchanged.
    """
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path,
                                      filter_function=date_filter)

    # The grouping columns double as the save partitions in both variants.
    if type.lower() == "hora":
        keys = ['data', 'hora', 'cluster_label']
    else:
        keys = ['data', 'cluster_label']

    aggregated = dataframe.groupby(keys).convertido.mean().reset_index()
    save_partitioned(aggregated, saida, keys)
Example #5
0
def main(visitas_com_conversao, saida, data_inicial, data_final,
         departamentos):
    """Prepare the visits-with-conversion dataframe for the given
    departments, L2-normalize the values and save partitioned by data/hora."""
    selected = [nome.strip() for nome in departamentos.split(",")]

    prepared = prepare_dataframe(selected, visitas_com_conversao,
                                 data_inicial, data_final)

    # Scale the values with the Normalizer transformer.
    scaled = transform(prepared, Normalizer())

    # Persist the scaled result.
    save_partitioned(scaled, saida, ['data', 'hora'])
Example #6
0
def main(visitas_com_conversao, saida, data_inicial, data_final, departamentos):
    """Prepare the visits dataframe for the requested departments ("all" for
    every department), scale it with MinMaxScaler and save it into a
    dedicated "min_max_scaler" subfolder partitioned by data/hora.

    Args:
        visitas_com_conversao: source of visits-with-conversion data.
        saida: base output directory.
        data_inicial / data_final: date range forwarded to prepare_dataframe.
        departamentos: "all" or a comma-separated list of department names.
    """
    if departamentos.lower() == "all":
        departamentos_lista = get_departamentos_all()
    else:
        departamentos_lista = [departamento.strip() for departamento in departamentos.split(",")]

    result = prepare_dataframe(departamentos_lista, visitas_com_conversao, data_inicial, data_final)

    # Scale the values to the [0, 1] range.
    result = transform(result, MinMaxScaler())

    # makedirs(exist_ok=True) fixes the FileExistsError os.mkdir raised on
    # re-runs, and also creates missing parent directories.
    saida = os.path.join(saida, "min_max_scaler")
    os.makedirs(saida, exist_ok=True)
    save_partitioned(result, saida, ['data', 'hora'])
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final, batch_size: int):
    """Cluster the dataset's feature vectors with mini-batch k-means, report
    inertia/score, and save the labelled rows in two partition layouts."""
    date_filter = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    frame = read_partitioned_json(file_path=dataset, filter_function=date_filter)

    features = np.asarray(list(frame['features'].to_numpy()))

    labels, inertia, score = minibatch_kmeans(features, number_of_cluster, batch_size)
    print(f"Clusters inertia: {inertia}")
    print(f"SCORE: {score}")
    frame['cluster_label'] = list(labels)

    print("Saving partitioned by cluster first..")
    save_partitioned(frame, os.path.join(saida, "by_cluster"), ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(frame, os.path.join(saida, "by_data"), ['data', 'hora', 'cluster_label'])
Example #8
0
def main(dataframe_path: str, saida: str, data_inicial, data_final,
         cluster_method: str, scaler: str, timescale: str):
    """Plot the cluster time series, save the aggregated dataframe
    partitioned by timescale/cluster, and write the figure as HTML."""
    output_file(os.path.join(saida, "serie_timescale.html"))

    figura, df = plot_series(dataframe_path, data_inicial, data_final,
                             cluster_method, scaler, timescale, title="")

    print("Saving agregated by scale and cluster..")
    save_partitioned(df, os.path.join(saida, "series", cluster_method),
                     [timescale, 'cluster_label'])
    save(figura)
Example #9
0
def main(pedidos, visitas, produtos, saida, data_inicial, data_final):
    """Join per-hour visit partitions with the product catalog and orders,
    then save the prepared result partitioned by SAVING_PARTITIONS.

    Args:
        pedidos: root directory of partitioned order JSON (data=/hora=).
        visitas: root directory of partitioned visit JSON (data=/hora=).
        produtos: CSV path of the product catalog.
        saida: output directory for the prepared partitions.
        data_inicial / data_final: datetime range; days are iterated with an
            exclusive end, matching the original behavior.
    """
    produtos_df = read_csv(produtos)
    produtos_df["product_id"] = produtos_df["product_id"].astype(str)

    delta: timedelta = (data_final - data_inicial)
    date_partitions = [
        data_inicial.date() + timedelta(days=days)
        for days in range(delta.days)
    ]

    for data in date_partitions:
        # Hoisted out of the hour loop: these only depend on the date.
        data_str = data.strftime('%Y-%m-%d')
        date_partition = f"data={data_str}"

        # BUG FIX: range(0, 23) only covered hours 0..22 and silently
        # skipped the hora=23 partition; a day has 24 hours.
        for hour in range(24):
            hour_snippet = f"hora={hour}"

            visitas_partition = os.path.join(visitas, date_partition,
                                             hour_snippet)
            visitas_df = read_partitioned_json(visitas_partition)
            visitas_df["product_id"] = visitas_df["product_id"].astype(str)
            visitas_df["visit_id"] = visitas_df["visit_id"].astype(str)

            pedidos_partition = os.path.join(pedidos, date_partition,
                                             hour_snippet)
            pedidos_df = read_partitioned_json(pedidos_partition)
            pedidos_df["visit_id"] = pedidos_df["visit_id"].astype(str)

            # Inner join keeps only visits with a known product; left join
            # keeps all visits whether or not they converted into an order.
            visita_com_produto_df = visitas_df.merge(produtos_df,
                                                     how="inner",
                                                     on="product_id",
                                                     suffixes=("", "_off"))
            visita_com_produto_e_conversao_df = visita_com_produto_df.merge(
                pedidos_df, how="left", on="visit_id", suffixes=("", "_off"))

            visita_com_produto_e_conversao_df["data"] = data_str
            visita_com_produto_e_conversao_df["hora"] = hour

            prepared = _prepare(visita_com_produto_e_conversao_df)
            save_partitioned(prepared, saida, SAVING_PARTITIONS)
            print(f"Concluído para {date_partition} {hour}h")
Example #10
0
def main(dataset: str, number_of_cluster: int, threshold: float, saida: str,
         data_inicial, data_final):
    """Cluster the dataset's feature vectors with BIRCH and save the
    labelled rows twice: by cluster first, then by date first."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    frame = read_partitioned_json(file_path=dataset,
                                  filter_function=date_filter)

    features = np.asarray(list(frame['features'].to_numpy()))
    frame['cluster_label'] = list(birch(features, number_of_cluster, threshold))

    print("Saving partitioned by cluster first..")
    save_partitioned(frame, os.path.join(saida, "by_cluster"),
                     ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(frame, os.path.join(saida, "by_data"),
                     ['data', 'hora', 'cluster_label'])
Example #11
0
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final):
    """Cluster the dataset's feature vectors with spectral clustering and
    save the labelled rows in three partition layouts under a
    "spectral_clustering" subfolder.

    Args:
        dataset: root of the partitioned JSON input.
        number_of_cluster: number of clusters for spectral_clustering.
        saida: base output directory.
        data_inicial / data_final: date range for filtering the input.
    """
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)

    frame = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(frame['features'].to_numpy()))
    frame['cluster_label'] = list(spectral_clustering(vector, number_of_cluster))

    saida = os.path.join(saida, "spectral_clustering")

    # Output folder name -> partition columns used both for directory
    # creation and for saving below.
    layouts = {
        "by_cluster_data_hora": ['cluster_label', 'data', 'hora'],
        "by_data_hora_cluster": ['data', 'hora', 'cluster_label'],
        "by_data_hora": ['data', 'hora'],
    }

    # makedirs(exist_ok=True) fixes the FileExistsError the four bare
    # os.mkdir calls raised when re-running, and creates parents as needed.
    for folder in layouts:
        os.makedirs(os.path.join(saida, folder), exist_ok=True)
    print("Directory successfully created")

    for folder, partitions in layouts.items():
        save_partitioned(frame, os.path.join(saida, folder), partitions)
Example #12
0
def save_prepared(saida: str, dataframe: pd.DataFrame):
    """Save *dataframe* under *saida*, partitioned by SAVING_PARTITIONS.

    NOTE(review): the previous comment claimed this extracts time and hour
    from a timestamp, but the body only delegates to save_partitioned — the
    comment was stale/wrong and has been corrected.
    """
    save_partitioned(dataframe, saida, SAVING_PARTITIONS)
Example #13
0
def save_prepared(saida: str, visita_com_produto_e_conversao_df: pd.DataFrame):
    """Run _prepare over the visit/product/conversion dataframe and save the
    result under *saida*, partitioned by SAVING_PARTITIONS."""
    save_partitioned(_prepare(visita_com_produto_e_conversao_df), saida,
                     SAVING_PARTITIONS)