# Manufacturing Entropy - Code for the article
# Original Data: Kaggle competition - Bosch Manufacturing Data: train_date.csv file
# Data used in this file: pre-processed ("data/manufacturing_paths.txt" and "data/manufacturing_edges.txt")
# Yamila Mariel Omar
# Date of original code: 21st July 2020

from graphfile import GraphFile

# Load manufacturing paths
# ========================
# `paths` maps each manufacturing path (a sequence of nodes) to the number of
# items that followed it, so summing the counts gives the total item count.
# NOTE(review): exact key type (tuple/list of nodes) depends on GraphFile — verify.
paths = GraphFile("data/manufacturing_paths.txt").read_paths_with_count()
number_of_manufactured_items = sum(paths.values())

# Load edges
# ==========
# `edges` maps (i, j) edge tuples to the number of items traversing them.
edges = GraphFile("data/manufacturing_edges.txt").read_edges_from_file()

# Filter out edges that only serve a handful of items
# ===================================================
# An edge used by fewer than 0.1% of all manufactured items is treated as noise.
threshold = 0.001 * number_of_manufactured_items
edges_to_remove = {k for k, v in edges.items() if v < threshold}

# Clean manufacturing paths
# =========================
# Keep only the paths that traverse none of the rare edges found above.
clean_paths = {}
for path, count in paths.items():
    # Consecutive node pairs of this path, i.e. the edges it traverses.
    path_edges = set(zip(path, path[1:]))
    if not edges_to_remove & path_edges:
        # NOTE(review): the source was truncated mid-loop here; retaining the
        # surviving path with its count is the only completion consistent with
        # `clean_paths = {}` above and the reload of cleaned paths downstream.
        clean_paths[path] = count
# Reload the cleaned manufacturing paths
# ======================================
# NOTE(review): `filename_paths` must be defined earlier in the full file — verify.
paths = GraphFile(filename_paths).read_paths_with_count()

# Generate graph from clean edges
# ===============================
edges = add_self_loops(paths, edges)
G = Graph(edges)

# ==========================================================================
# Get frequency of n in paths
# ==========================================================================
# freq_n_in_paths[n] = total count of items whose path visits node n,
# later normalized by the total number of manufactured items.
freq_n_in_paths = {n: 0 for n in G.nodes}
for path, count in paths.items():
    for node in path:
        freq_n_in_paths[node] += count

number_of_paths = sum(paths.values())
freq_n_in_paths = {k: v / number_of_paths for k, v in freq_n_in_paths.items()}

# Make plot
# =========
# Horizontal bar chart of per-node visit frequency, nodes sorted for stable order.
x = sorted(freq_n_in_paths)
freq = [freq_n_in_paths[n] for n in x]
x_pos = list(range(len(x)))
plt.barh(x_pos, freq, color='red')
plt.ylabel("Nodes")