def create_visitas_df(date_partition: str, hour_snnipet: str, visitas: str) -> pd.DataFrame:
    """Load one (date, hour) partition of visit data.

    Args:
        date_partition: partition directory name, e.g. ``data=2021-01-01``.
        hour_snnipet: hour directory name, e.g. ``hora=3``.
        visitas: root path of the partitioned visitas dataset.

    Returns:
        DataFrame with ``product_id`` and ``visit_id`` coerced to ``str`` so
        they merge cleanly with the produtos/pedidos frames.
    """
    # NOTE(review): the original annotated ``visitas`` as ``pd.DataFrame``,
    # but it is joined into a filesystem path, so it must be a string path
    # (same as ``pedidos`` in create_pedidos_df).
    visitas_partition = os.path.join(visitas, date_partition, hour_snnipet)
    visitas_df = read_partitioned_json(visitas_partition)
    visitas_df["product_id"] = visitas_df["product_id"].astype(str)
    visitas_df["visit_id"] = visitas_df["visit_id"].astype(str)
    return visitas_df
def main(dataset: str, saida: str, data_inicial, data_final, samples: int):
    """Run OPTICS clustering over the 'features' column and save the result.

    The labelled frame is saved twice: partitioned cluster-first and
    data-first, under ``saida``.

    Args:
        dataset: root path of the partitioned-JSON input.
        saida: output root directory.
        data_inicial / data_final: date window passed to ``filter_date``.
        samples: min-samples parameter forwarded to ``optics``.
    """
    filter_function = partial(filter_date,
                              data_inicial=data_inicial,
                              data_final=data_final)
    print(dataset)
    df = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    print(df)
    vector = np.asarray(list(df['features'].to_numpy()))
    labels = optics(vector, samples)
    clust_quantity = set(labels)
    print(clust_quantity)
    print(len(clust_quantity))
    # f-prefixes removed below: the strings contain no placeholders.
    print("saving clusters...")
    df['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first..")
    save_partitioned(df, os.path.join(saida, "by_cluster"),
                     ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(df, os.path.join(saida, "by_data"),
                     ['data', 'hora', 'cluster_label'])
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final):
    """Spectral-cluster the feature vectors and save three partition layouts.

    Args:
        dataset: root path of the partitioned-JSON input.
        number_of_cluster: number of clusters forwarded to ``spectral_clustering``.
        saida: output root; a ``spectral_clustering`` subtree is created inside.
        data_inicial / data_final: date window passed to ``filter_date``.
    """
    filter_function = partial(filter_date,
                              data_inicial=data_inicial,
                              data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels = spectral_clustering(vector, number_of_cluster)
    dataset['cluster_label'] = list(labels)
    saida = os.path.join(saida, "spectral_clustering")
    # Fixed: os.makedirs(..., exist_ok=True) instead of bare os.mkdir, so
    # reruns do not crash with FileExistsError and missing parents are created.
    os.makedirs(saida, exist_ok=True)
    by_cluster_data_hora = os.path.join(saida, "by_cluster_data_hora")
    os.makedirs(by_cluster_data_hora, exist_ok=True)
    by_data_hora_cluster = os.path.join(saida, "by_data_hora_cluster")
    os.makedirs(by_data_hora_cluster, exist_ok=True)
    by_data_hora = os.path.join(saida, "by_data_hora")
    os.makedirs(by_data_hora, exist_ok=True)
    print("Directory successfully created")
    save_partitioned(dataset, by_cluster_data_hora, ['cluster_label', 'data', 'hora'])
    save_partitioned(dataset, by_data_hora_cluster, ['data', 'hora', 'cluster_label'])
    save_partitioned(dataset, by_data_hora, ['data', 'hora'])
def create_pedidos_df(date_partition: str, hour_snnipet: str, pedidos: str) -> pd.DataFrame:
    """Load one (date, hour) partition of order data.

    ``visit_id`` is coerced to ``str`` so it can be merged against the
    visitas frame.
    """
    partition_path = os.path.join(pedidos, date_partition, hour_snnipet)
    orders = read_partitioned_json(partition_path)
    orders["visit_id"] = orders["visit_id"].astype(str)
    return orders
def plot_conv(dataframe_path, data_inicial: str, data_final: str,
              cluster_method: str, scaler: str, title: str):
    """Bar plot of per-cluster conversion rate within a date window.

    Returns a bokeh figure, or ``None`` when ``scaler`` is unknown or the
    expected ``by_data`` directory does not exist (the original raised
    ``NameError`` on those paths).

    NOTE(review): ``title`` is accepted but never used — kept for caller
    compatibility; confirm whether it should be passed to ``figure``.
    """
    def filter_data_cluster(row):
        return filter_date(row, data_inicial, data_final) and filter_cluster(row)

    p = None  # Fixed: avoid NameError when the guards below fail.
    if scaler in LIST_SCALER.keys():
        path = os.path.join(dataframe_path, cluster_method, scaler, "by_data")
        if os.path.isdir(path):
            dataframe = read_partitioned_json(path, filter_function=filter_data_cluster)
            dataframe["cluster_label"] = dataframe["cluster_label"].astype(int)
            p = figure(y_range=(0.16, 0.20))
            colors = itertools.cycle(palette)
            n_cluster = sorted(dataframe['cluster_label'].unique())
            print(n_cluster)
            for n, color in zip(n_cluster, colors):
                df_1 = dataframe[(dataframe['cluster_label'] == n)
                                 & (dataframe['convertido'] == 1)].count()
                df_0 = dataframe[(dataframe['cluster_label'] == n)].count()
                data = df_1["convertido"] / df_0["convertido"]
                print(data)
                # Fixed: draw ONE bar per cluster. The original called
                # p.vbar(x=n_cluster, top=data) every iteration, redrawing
                # every x position with the current cluster's scalar rate,
                # and never used the cycled color.
                p.vbar(x=[n], top=[data], width=0.8, color=color)
    return p
def main(dataframe_path: str, saida: str, x_axis, y_axis, cluster_label,
         data_inicial, data_final):
    """Render side-by-side scatter plots (original vs scaled axes) to *saida*."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    # Expand the packed 'features' vectors into named scaled columns.
    scaled_cols = pd.DataFrame(
        dataframe["features"].values.tolist(),
        columns=["preco_s", "prazo_s", "frete_s", "latitude_s", "longitude_s"])
    dataframe = dataframe.join(scaled_cols)
    output_file(saida)
    original_plot = plot_scatter(
        dataframe, x_axis, y_axis, cluster_label,
        title=f"Scatter_{x_axis}_vs_{y_axis}_Original")
    scaled_plot = plot_scatter(
        dataframe, f"{x_axis}_s", f"{y_axis}_s", cluster_label,
        title=f"Scatter_{x_axis}_vs_{y_axis}_Escalado")
    save(gridplot([original_plot, scaled_plot], ncols=2))
def plot_map(dataframe_path: str, data_inicial: str, data_final: str,
             cluster_method: str, scaler: str):
    """Scatter longitude vs latitude coloured by cluster label."""
    def _keep(row):
        return filter_cluster(row) and filter_date(row, data_inicial, data_final)

    if scaler in LIST_SCALER.keys():
        by_cluster = os.path.join(dataframe_path, cluster_method, scaler, "by_cluster")
        if os.path.isdir(by_cluster):
            dataframe = read_partitioned_json(by_cluster, filter_function=_keep)
            dataframe["cluster_label"] = dataframe["cluster_label"].astype(int)
            map_title = ("Map_clusters_" + cluster_method + "_method_"
                         + scaler + "_data")
            p = plot_scatter(dataframe, "longitude", "latitude",
                             "cluster_label", title=map_title)
    # NOTE(review): `p` is unbound when the guards above fail, so this raises
    # NameError in that case — behavior preserved from the original.
    return p
def plot_series(dataframe_path, data_inicial: str, data_final: str,
                cluster_method: str, scaler: str, timescale: str, title: str):
    """Line plot of conversion rate per cluster over *timescale*.

    Returns:
        (figure, DataFrame) — the bokeh figure and a tidy frame with columns
        [timescale, 'conversao', 'cluster_label'].

    NOTE(review): when ``scaler`` is unknown or the ``by_data`` directory is
    missing, ``p``/``df`` are unbound and the return raises NameError —
    preserved from the original; confirm whether callers rely on it.
    """
    def filter_data_cluster(row):
        return filter_date(row, data_inicial, data_final) and filter_cluster(row)

    if scaler in LIST_SCALER.keys():
        path = os.path.join(dataframe_path, cluster_method, scaler, "by_data")
        if os.path.isdir(path):
            dataframe = read_partitioned_json(path, filter_function=filter_data_cluster)
            dataframe["cluster_label"] = dataframe["cluster_label"].astype(int)
            # Split "YYYY-MM-DD HH:MM..."-style strings into time components.
            dataframe['dia'] = [
                d.split(" ")[0].split("-")[-1] for d in dataframe["datahora"]
            ]
            dataframe['hora'] = [
                d.split(" ")[-1].split(":")[0] for d in dataframe["datahora"]
            ]
            dataframe['minuto'] = [
                d.split(" ")[-1].split(":")[1] for d in dataframe["datahora"]
            ]
            n_cluster = sorted(dataframe['cluster_label'].unique())
            TOOLS = 'crosshair,save,pan,box_zoom,reset,wheel_zoom'
            p = figure(title=title, y_axis_type="linear", tools=TOOLS)
            colors = itertools.cycle(palette)
            # Hoisted: the x axis does not depend on the cluster.
            x_axis = sorted(dataframe[timescale].unique())
            df = pd.DataFrame()
            for n, color in zip(n_cluster, colors):
                df_1 = dataframe[(dataframe['cluster_label'] == n)
                                 & (dataframe['convertido'] == 1)].groupby(
                                     by=[timescale]).count().sort_values(by=timescale)
                df_0 = dataframe[(dataframe['cluster_label'] == n)].groupby(
                    by=[timescale]).count().sort_values(by=timescale)
                cv = df_1["convertido"] / df_0["convertido"]
                # Fixed: build a fresh frame per cluster instead of mutating a
                # single shared df_temp across iterations.
                df_temp = pd.DataFrame({timescale: x_axis,
                                        'conversao': cv.values,
                                        'cluster_label': n})
                df = pd.concat([df, df_temp], ignore_index=True)
                # Fixed: use the cycled color; palette[n] raised IndexError
                # for label values beyond the palette length.
                p.line(x_axis, cv, legend_label=str(n), color=color, line_width=3)
            p.legend.location = "top_left"
            p.xaxis.axis_label = timescale
            p.yaxis.axis_label = 'Conversao'
    return p, df
def main(pedidos, visitas, produtos, saida, data_inicial, data_final):
    """Join visits, products and orders per (date, hour) partition and save.

    For every day in [data_inicial, data_final) and every hour 0..23, loads
    the visitas and pedidos partitions, joins them with the produtos CSV,
    stamps 'data'/'hora' columns, runs ``_prepare`` and saves partitioned by
    ``SAVING_PARTITIONS``.
    """
    produtos_df = read_csv(produtos)
    produtos_df["product_id"] = produtos_df["product_id"].astype(str)
    delta: timedelta = data_final - data_inicial
    # NOTE(review): range(delta.days) excludes data_final itself — presumably
    # a deliberate half-open interval; confirm with callers.
    date_partitions = [
        data_inicial.date() + timedelta(days=days) for days in range(delta.days)
    ]
    for data in date_partitions:
        data_str = data.strftime('%Y-%m-%d')
        date_partition = f"data={data_str}"
        # Fixed: range(0, 23) skipped hour 23; a day has hours 0..23.
        for hour in range(24):
            hour_snnipet = f"hora={hour}"
            # Consistency: reuse the shared partition loaders instead of
            # duplicating their bodies inline.
            visitas_df = create_visitas_df(date_partition, hour_snnipet, visitas)
            pedidos_df = create_pedidos_df(date_partition, hour_snnipet, pedidos)
            visita_com_produto_df = visitas_df.merge(produtos_df,
                                                     how="inner",
                                                     on="product_id",
                                                     suffixes=("", "_off"))
            visita_com_produto_e_conversao_df = visita_com_produto_df.merge(
                pedidos_df, how="left", on="visit_id", suffixes=("", "_off"))
            visita_com_produto_e_conversao_df["data"] = data_str
            visita_com_produto_e_conversao_df["hora"] = hour
            prepared = _prepare(visita_com_produto_e_conversao_df)
            save_partitioned(prepared, saida, SAVING_PARTITIONS)
            # Fixed mojibake: "ConcluĂdo" -> "Concluído".
            print(f"Concluído para {date_partition} {hour}h")
def dash(dataframe_path: str, data_inicial: str, data_final: str,
         dash_type: str, cluster_label: str, bins: int, scaler: str):
    """Build a grid of scatter or histogram plots for the scaled features.

    Args:
        dash_type: "scatter" (one plot per axis pair) or "histogram"
            (one plot per axis).
        cluster_label: column used to colour scatter points; when it equals
            'cluster_label' the column is coerced to int.
    """
    # Fixed: the original enumerated every combination of every size and then
    # kept only the pairs — itertools.combinations(LIST_AXIS, 2) yields
    # exactly those pairs in the same order without the exponential blow-up.
    axis_options = list(itertools.combinations(LIST_AXIS, 2))
    figura = []
    if scaler in LIST_SCALER.keys():
        print(os.path.join(dataframe_path, scaler))
        if os.path.isdir(os.path.join(dataframe_path, scaler)):
            path = os.path.join(dataframe_path, scaler)
            filter_function = partial(filter_date,
                                      data_inicial=data_inicial,
                                      data_final=data_final)
            dataframe = read_partitioned_json(path, filter_function=filter_function)
            expanded_cols = pd.DataFrame(
                dataframe["features"].values.tolist(),
                columns=["preco_s", "prazo_s", "frete_s",
                         "latitude_s", "longitude_s"])
            dataframe = dataframe.join(expanded_cols)
            if cluster_label == 'cluster_label':
                dataframe['cluster_label'] = dataframe['cluster_label'].astype(int)
            if dash_type == "scatter":
                for axis in axis_options:
                    p = plot_scatter(dataframe, axis[0] + "_s", axis[1] + "_s",
                                     cluster_label,
                                     title="Scatter of " + scaler + "_"
                                     + axis[0] + "_vs_" + axis[1])
                    figura.append(p)
            elif dash_type == "histogram":
                for axe in LIST_AXIS:
                    p = plot_histogram(dataframe, axe, bins,
                                       title="Histograma " + axe)
                    figura.append(p)
            else:
                # f-prefix removed: no placeholders in the message.
                print("Erro: Opcoes possiveis: 'scatter' ou 'histogram'")
    figs = gridplot(figura, ncols=len(LIST_AXIS))
    return figs
def prepare_dataframe(departamentos_lista: Sequence[str], dataset_path,
                      data_inicial: datetime, data_final: datetime):
    """Read visits limited to the given departments and date window, then
    enrich them with coordinates and conversion info.

    The 'departamento' column is dropped from the result once filtering is
    done.
    """
    def _keep(row):
        in_departamento = filter_departamento(row, departamentos_lista)
        return in_departamento and filter_date(row, data_inicial, data_final)

    visitas = read_partitioned_json(dataset_path, _keep)
    com_coordenadas = _extracting_coordinates(visitas)
    com_conversao = convert(com_coordenadas)
    return com_conversao.drop('departamento', axis=1)
def main(dataframe_path: str, saida: str, y_axis, data_inicial, data_final,
         name_to_save):
    """Save a confidence-interval line chart as <saida><name_to_save>.html."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    output_file(saida + str(name_to_save) + ".html")
    chart = line_confidence_intervals(dataframe=dataframe, y_axis=y_axis)
    save(chart)
def main(dataframe_path: str, saida, number_of_cluster: int, data_inicial,
         data_final, name_to_save):
    """Compute the GAP statistic up to *number_of_cluster* and save its chart."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataset = read_partitioned_json(file_path=dataframe_path,
                                    filter_function=date_filter)
    feature_matrix = np.asarray(list(dataset['features'].to_numpy()))
    # Only the per-k GAP frame is plotted; the optimal k itself is unused.
    _, gapdf = optimalK(data=feature_matrix, maxClusters=number_of_cluster + 1)
    output_file(saida + str(name_to_save) + ".html")
    save(line_cluster_GAP(gapdf))
def main(dataframe_path: str, saida: str, data_inicial, data_final,
         name_to_save):
    """Render a time-series line chart for the filtered window and save it."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    output_file(saida + str(name_to_save) + ".html")
    save(line_time(dataframe, data_inicial, data_final))
def main(dataframe_path: str, saida: str, x_axis, y_axis, cluster_label,
         data_inicial, data_final, name_to_save):
    """Scatter *x_axis* vs *y_axis* coloured by *cluster_label*; save as HTML."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    output_file(saida + str(name_to_save) + ".html")
    save(plot(dataframe, x_axis, y_axis, cluster_label))
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial,
         data_final):
    """K-means the feature vectors and save partitioned by data/hora.

    Adds both the assigned centroid coordinate and the label to the frame.
    """
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=date_filter)
    feature_matrix = np.asarray(list(dataset['features'].to_numpy()))
    coordinates, labels = kmeans(feature_matrix, number_of_cluster)
    dataset['cluster_coordinate'] = list(coordinates)
    dataset['cluster_label'] = list(labels)
    save_partitioned(dataset, saida, ['data', 'hora'])
def main(dataframe_path: str, saida: str, number_of_cluster: int,
         data_inicial, data_final, name_to_save):
    """Plot BIC scores for cluster counts 2..number_of_cluster and save."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    cluster_counts = list(range(2, int(number_of_cluster) + 1))
    dataframe = get_bic_dataframe(dataframe, cluster_counts)
    output_file(saida + str(name_to_save) + ".html")
    save(line_cluster_BIC(dataframe))
def main(dataframe_path: str, saida: str, data_inicial, data_final, type):
    """Aggregate mean conversion per cluster, hourly or daily.

    When *type* is "hora" (case-insensitive) the mean is taken per
    (data, hora, cluster_label); otherwise per (data, cluster_label).

    NOTE: the parameter name ``type`` shadows the builtin; kept unchanged so
    keyword callers are not broken.
    """
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    if type.lower() == "hora":
        keys = ["data", "hora", "cluster_label"]
    else:
        keys = ["data", "cluster_label"]
    grouped = dataframe.groupby(keys).convertido.mean().reset_index()
    save_partitioned(grouped, saida, list(keys))
def main(dataframe_path: str, saida: str, data_inicial, data_final,
         name_to_save):
    """Bar chart of mean conversion rate per cluster, saved as HTML."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    conversao = dataframe.groupby("cluster_label").convertido.mean().reset_index()
    output_file(saida + str(name_to_save) + ".html")
    save(bar(conversao, data_inicial, data_final))
def main(dataframe_path: str, saida: str, cluster_label: str, components,
         data_inicial, data_final, name_to_save):
    """Project features onto 2 principal components and scatter them.

    NOTE: ``components`` is accepted but unused, as in the original
    signature; kept for caller compatibility.
    """
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    principal_df = get_pca(dataframe, cluster_label)
    output_file(saida + str(name_to_save) + ".html")
    save(plot(principal_df, 'principal component 1', 'principal component 2',
              cluster_label))
def main(dataframe_path: str, saida: str, axe: str, bins: int, data_inicial,
         data_final):
    """Side-by-side histograms of one feature: original vs scaled values."""
    date_filter = partial(filter_date,
                          data_inicial=data_inicial,
                          data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=date_filter)
    scaled_cols = pd.DataFrame(
        dataframe["features"].values.tolist(),
        columns=["preco_s", "prazo_s", "frete_s", "latitude_s", "longitude_s"])
    dataframe = dataframe.join(scaled_cols)
    output_file(saida)
    hist_original = plot_histogram(dataframe, axe, bins,
                                   title="Histograma " + axe + "_original")
    hist_scaled = plot_histogram(dataframe, axe + "_s", bins,
                                 title="Histograma " + axe + "_escalado")
    save(gridplot([hist_original, hist_scaled], ncols=2))
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial,
         data_final, batch_size: int):
    """Mini-batch k-means over the features; save cluster-first and data-first.

    Prints the model's inertia and score before saving both partition layouts
    under ``saida``.
    """
    filter_function = partial(filter_date,
                              data_inicial=data_inicial,
                              data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels, inertia, score = minibatch_kmeans(vector, number_of_cluster, batch_size)
    print(f"Clusters inertia: {inertia}")
    print(f"SCORE: {score}")
    # f-prefixes removed below: those strings interpolate nothing.
    dataset['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first..")
    save_partitioned(dataset, os.path.join(saida, "by_cluster"),
                     ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(dataset, os.path.join(saida, "by_data"),
                     ['data', 'hora', 'cluster_label'])
def main(dataset: str, number_of_cluster: int, threshold: float, saida: str,
         data_inicial, data_final):
    """BIRCH-cluster the features and save cluster-first and data-first.

    Args:
        number_of_cluster / threshold: forwarded to ``birch``.
    """
    filter_function = partial(filter_date,
                              data_inicial=data_inicial,
                              data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels = birch(vector, number_of_cluster, threshold)
    # f-prefixes removed below: those strings interpolate nothing.
    dataset['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first..")
    save_partitioned(dataset, os.path.join(saida, "by_cluster"),
                     ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first..")
    save_partitioned(dataset, os.path.join(saida, "by_data"),
                     ['data', 'hora', 'cluster_label'])