def main(visitas_com_conversao, saida, data_inicial, data_final, departamentos, scaler=None):
    departamentos_lista = [
        departamento.strip() for departamento in departamentos.split(",")
    ]
    print(departamentos_lista)
    result = prepare_dataframe(departamentos_lista, visitas_com_conversao, data_inicial, data_final)
    # Scale the values
    if scaler in LIST_SCALER:
        result = transform(result, LIST_SCALER[scaler])
        saida = os.path.join(saida, scaler)
        print(f"Scaling with {scaler}")
        # Save the result
        save_partitioned(result, saida, ['data', 'hora'])
        print("saved ok")
    elif scaler is None:
        result = prepare_features(result)
        saida = os.path.join(saida, "sem_normalizar")
        print("Preparing data without scaling")
        # Save the result
        save_partitioned(result, saida, ['data', 'hora'])
        print("saved ok")
    else:
        print("Error: provide a valid scaler key, or None to keep the original data")

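# LIST_SCALER and transform() are defined elsewhere in the project; below is a minimal
# sketch of what they are assumed to look like, built on scikit-learn scalers. The exact
# dictionary keys and the feature-column layout are assumptions, not the project's code.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

# Assumed mapping from the CLI scaler key to a scaler instance.
LIST_SCALER = {
    "min_max": MinMaxScaler(),
    "standard": StandardScaler(),
    "normalizer": Normalizer(),
}


def transform(df: pd.DataFrame, scaler) -> pd.DataFrame:
    # Assumes the numeric feature columns are everything except the partition columns,
    # and that downstream steps expect a single 'features' column holding one vector per row.
    feature_cols = [col for col in df.columns if col not in ("data", "hora")]
    scaled = scaler.fit_transform(df[feature_cols].to_numpy())
    out = df[["data", "hora"]].copy()
    out["features"] = list(scaled)
    return out
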
def main(dataset: str, saida: str, data_inicial, data_final, samples: int):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    print(dataset)
    df = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    print(df)
    vector = np.asarray(list(df['features'].to_numpy()))
    labels = optics(vector, samples)
    unique_labels = set(labels)
    print(unique_labels)
    print(len(unique_labels))
    print("saving clusters...")
    df['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first...")
    save_partitioned(df, os.path.join(saida, "by_cluster"), ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first...")
    save_partitioned(df, os.path.join(saida, "by_data"), ['data', 'hora', 'cluster_label'])

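# optics() is defined elsewhere; a minimal sketch of the assumed wrapper around
# sklearn.cluster.OPTICS (mapping the `samples` argument to min_samples is an assumption).
import numpy as np
from sklearn.cluster import OPTICS


def optics(vector: np.ndarray, samples: int) -> np.ndarray:
    # Fit OPTICS on the feature matrix and return one label per row (-1 marks noise points).
    model = OPTICS(min_samples=samples)
    model.fit(vector)
    return model.labels_
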
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    coordinates, labels = kmeans(vector, number_of_cluster)
    dataset['cluster_coordinate'] = list(coordinates)
    dataset['cluster_label'] = list(labels)
    save_partitioned(dataset, saida, ['data', 'hora'])

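# kmeans() is defined elsewhere; a minimal sketch of the assumed wrapper around
# sklearn.cluster.KMeans. Returning each point's cluster centre as its coordinate
# (so both returned sequences align row by row with the dataframe) is an assumption.
import numpy as np
from sklearn.cluster import KMeans


def kmeans(vector: np.ndarray, number_of_cluster: int):
    model = KMeans(n_clusters=number_of_cluster, random_state=0, n_init=10)
    labels = model.fit_predict(vector)
    # One coordinate per sample: the centre of the cluster that sample was assigned to.
    coordinates = model.cluster_centers_[labels]
    return coordinates, labels
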
def main(dataframe_path: str, saida: str, data_inicial, data_final, type):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    dataframe = read_partitioned_json(dataframe_path, filter_function=filter_function)
    if type.lower() == "hora":
        dataframe = dataframe.groupby(
            ["data", "hora", "cluster_label"]
        ).convertido.mean().reset_index()
        save_partitioned(dataframe, saida, ['data', 'hora', 'cluster_label'])
    else:
        dataframe = dataframe.groupby(
            ["data", "cluster_label"]
        ).convertido.mean().reset_index()
        save_partitioned(dataframe, saida, ['data', 'cluster_label'])

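# filter_date() and read_partitioned_json() are shared helpers not shown here; a minimal
# sketch under the assumption that the data lives in Hive-style "data=YYYY-MM-DD/hora=H"
# folders of JSON-lines part files and that filter_function decides which paths to read.
import glob
import os
import re
from datetime import datetime

import pandas as pd


def filter_date(path: str, data_inicial, data_final) -> bool:
    # Keep only partitions whose "data=YYYY-MM-DD" component falls inside the requested window.
    match = re.search(r"data=(\d{4}-\d{2}-\d{2})", path)
    if not match:
        return False
    partition_date = datetime.strptime(match.group(1), "%Y-%m-%d")
    return data_inicial <= partition_date <= data_final


def read_partitioned_json(file_path: str, filter_function=None) -> pd.DataFrame:
    # Read every part file accepted by filter_function and concatenate into one dataframe.
    frames = []
    for part in glob.glob(os.path.join(file_path, "**", "*.json"), recursive=True):
        if filter_function is None or filter_function(part):
            frames.append(pd.read_json(part, lines=True))
    return pd.concat(frames, ignore_index=True)
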
def main(visitas_com_conversao, saida, data_inicial, data_final, departamentos):
    departamentos_lista = [
        departamento.strip() for departamento in departamentos.split(",")
    ]
    result = prepare_dataframe(departamentos_lista, visitas_com_conversao, data_inicial, data_final)
    # Scale the values
    result_scaled = transform(result, Normalizer())
    # Save the result
    save_partitioned(result_scaled, saida, ['data', 'hora'])

def main(visitas_com_conversao, saida, data_inicial, data_final, departamentos):
    if departamentos.lower() == "all":
        departamentos_lista = get_departamentos_all()
    else:
        departamentos_lista = [departamento.strip() for departamento in departamentos.split(",")]
    result = prepare_dataframe(departamentos_lista, visitas_com_conversao, data_inicial, data_final)
    # Scale the values
    result = transform(result, MinMaxScaler())
    # Save the result in a new folder
    saida = os.path.join(saida, "min_max_scaler")
    os.makedirs(saida, exist_ok=True)
    save_partitioned(result, saida, ['data', 'hora'])

def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final, batch_size: int):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels, inertia, score = minibatch_kmeans(vector, number_of_cluster, batch_size)
    print(f"Clusters inertia: {inertia}")
    print(f"SCORE: {score}")
    dataset['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first...")
    save_partitioned(dataset, os.path.join(saida, "by_cluster"), ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first...")
    save_partitioned(dataset, os.path.join(saida, "by_data"), ['data', 'hora', 'cluster_label'])

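# minibatch_kmeans() is defined elsewhere; a minimal sketch of the assumed wrapper around
# sklearn.cluster.MiniBatchKMeans. Reporting silhouette_score as the SCORE is an assumption.
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score


def minibatch_kmeans(vector: np.ndarray, number_of_cluster: int, batch_size: int):
    model = MiniBatchKMeans(n_clusters=number_of_cluster, batch_size=batch_size, random_state=0)
    labels = model.fit_predict(vector)
    # inertia_: sum of squared distances of samples to their closest cluster centre.
    return labels, model.inertia_, silhouette_score(vector, labels)
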
def main(dataframe_path: str, saida: str, data_inicial, data_final, cluster_method: str, scaler: str, timescale: str):
    output_file(os.path.join(saida, "serie_timescale.html"))
    figura, df = plot_series(dataframe_path, data_inicial, data_final, cluster_method, scaler, timescale, title="")
    print("Saving aggregated by scale and cluster...")
    save_partitioned(df, os.path.join(saida, "series", cluster_method), [timescale, 'cluster_label'])
    save(figura)

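# plot_series() is defined elsewhere; a minimal sketch of the assumed Bokeh-based helper.
# It relies on the shared filter_date/read_partitioned_json helpers used by the other steps,
# and the aggregation (mean of 'convertido' per timescale and 'cluster_label') is an assumption.
from functools import partial

import pandas as pd
from bokeh.palettes import Category10
from bokeh.plotting import figure


def plot_series(dataframe_path, data_inicial, data_final, cluster_method, scaler, timescale, title=""):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    df = read_partitioned_json(dataframe_path, filter_function=filter_function)
    series = df.groupby([timescale, "cluster_label"]).convertido.mean().reset_index()
    figura = figure(title=title or f"{cluster_method} / {scaler}",
                    x_axis_type="datetime" if timescale == "data" else "linear")
    # One line per cluster, coloured from a categorical palette.
    for i, (label, grupo) in enumerate(series.groupby("cluster_label")):
        x = pd.to_datetime(grupo[timescale]) if timescale == "data" else grupo[timescale]
        figura.line(x, grupo["convertido"],
                    legend_label=f"cluster {label}",
                    color=Category10[10][i % 10])
    return figura, series
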
def main(pedidos, visitas, produtos, saida, data_inicial, data_final):
    produtos_df = read_csv(produtos)
    produtos_df["product_id"] = produtos_df["product_id"].astype(str)
    delta: timedelta = (data_final - data_inicial)
    date_partitions = [
        data_inicial.date() + timedelta(days=days) for days in range(delta.days)
    ]
    for data in date_partitions:
        # Iterate over all 24 hourly partitions of the day
        hour_partitions = list(range(0, 24))
        for hour in hour_partitions:
            hour_snippet = f"hora={hour}"
            data_str = data.strftime('%Y-%m-%d')
            date_partition = f"data={data_str}"
            visitas_partition = os.path.join(visitas, date_partition, hour_snippet)
            visitas_df = read_partitioned_json(visitas_partition)
            visitas_df["product_id"] = visitas_df["product_id"].astype(str)
            visitas_df["visit_id"] = visitas_df["visit_id"].astype(str)
            pedidos_partition = os.path.join(pedidos, date_partition, hour_snippet)
            pedidos_df = read_partitioned_json(pedidos_partition)
            pedidos_df["visit_id"] = pedidos_df["visit_id"].astype(str)
            # Join visits with the product catalogue, then left-join orders to flag conversions
            visita_com_produto_df = visitas_df.merge(produtos_df, how="inner", on="product_id", suffixes=("", "_off"))
            visita_com_produto_e_conversao_df = visita_com_produto_df.merge(
                pedidos_df, how="left", on="visit_id", suffixes=("", "_off"))
            visita_com_produto_e_conversao_df["data"] = data_str
            visita_com_produto_e_conversao_df["hora"] = hour
            prepared = _prepare(visita_com_produto_e_conversao_df)
            save_partitioned(prepared, saida, SAVING_PARTITIONS)
            print(f"Completed for {date_partition} {hour}h")

def main(dataset: str, number_of_cluster: int, threshold: float, saida: str, data_inicial, data_final):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels = birch(vector, number_of_cluster, threshold)
    dataset['cluster_label'] = list(labels)
    print("Saving partitioned by cluster first...")
    save_partitioned(dataset, os.path.join(saida, "by_cluster"), ['cluster_label', 'data', 'hora'])
    print("Saving partitioned by data first...")
    save_partitioned(dataset, os.path.join(saida, "by_data"), ['data', 'hora', 'cluster_label'])

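# birch() is defined elsewhere; a minimal sketch of the assumed wrapper around sklearn.cluster.Birch.
import numpy as np
from sklearn.cluster import Birch


def birch(vector: np.ndarray, number_of_cluster: int, threshold: float) -> np.ndarray:
    # threshold controls the radius of the CF subclusters; the returned labels are the final clustering.
    model = Birch(n_clusters=number_of_cluster, threshold=threshold)
    return model.fit_predict(vector)
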
def main(dataset: str, number_of_cluster: int, saida: str, data_inicial, data_final):
    filter_function = partial(filter_date, data_inicial=data_inicial, data_final=data_final)
    dataset = read_partitioned_json(file_path=dataset, filter_function=filter_function)
    vector = np.asarray(list(dataset['features'].to_numpy()))
    labels = spectral_clustering(vector, number_of_cluster)
    dataset['cluster_label'] = list(labels)
    saida = os.path.join(saida, "spectral_clustering")
    os.makedirs(saida, exist_ok=True)
    by_cluster_data_hora = os.path.join(saida, "by_cluster_data_hora")
    os.makedirs(by_cluster_data_hora, exist_ok=True)
    by_data_hora_cluster = os.path.join(saida, "by_data_hora_cluster")
    os.makedirs(by_data_hora_cluster, exist_ok=True)
    by_data_hora = os.path.join(saida, "by_data_hora")
    os.makedirs(by_data_hora, exist_ok=True)
    print("Directories successfully created")
    save_partitioned(dataset, by_cluster_data_hora, ['cluster_label', 'data', 'hora'])
    save_partitioned(dataset, by_data_hora_cluster, ['data', 'hora', 'cluster_label'])
    save_partitioned(dataset, by_data_hora, ['data', 'hora'])

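# spectral_clustering() is defined elsewhere; a minimal sketch of the assumed wrapper around
# sklearn.cluster.SpectralClustering (keeping the default "rbf" affinity is an assumption).
import numpy as np
from sklearn.cluster import SpectralClustering


def spectral_clustering(vector: np.ndarray, number_of_cluster: int) -> np.ndarray:
    model = SpectralClustering(n_clusters=number_of_cluster, random_state=0)
    return model.fit_predict(vector)
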
def save_prepared(saida: str, dataframe: pd.DataFrame):
    # Persist the already-prepared dataframe using the standard partition columns
    save_partitioned(dataframe, saida, SAVING_PARTITIONS)

def save_prepared(saida: str, visita_com_produto_e_conversao_df: pd.DataFrame):
    prepared = _prepare(visita_com_produto_e_conversao_df)
    save_partitioned(prepared, saida, SAVING_PARTITIONS)

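# save_partitioned() is the shared writer used by every step above; a minimal sketch under
# the assumption that it writes Hive-style "col=value" folders of JSON-lines part files
# (the part file name and keeping the partition columns inside the file are assumptions).
import os

import pandas as pd


def save_partitioned(dataframe: pd.DataFrame, saida: str, partition_cols) -> None:
    for keys, grupo in dataframe.groupby(partition_cols):
        if not isinstance(keys, tuple):
            keys = (keys,)
        # Build e.g. <saida>/data=2020-01-01/hora=3/ and write that slice as JSON lines.
        partition_dir = os.path.join(
            saida, *[f"{col}={value}" for col, value in zip(partition_cols, keys)]
        )
        os.makedirs(partition_dir, exist_ok=True)
        grupo.to_json(os.path.join(partition_dir, "part-0000.json"), orient="records", lines=True)
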