def test_parallelism():
    """Tests operator parallelism."""
    env = Environment()
    # Apply one common parallelism level to every operator in the dataflow
    env.set_parallelism(2)
    stream = env.source(None).map(None).filter(None).flat_map(None)
    env._collect_garbage()
    for op in env.operators.values():
        if op.type == OpType.Source:
            # TODO (john): Currently each source has only one instance
            assert op.num_instances == 1, (op.num_instances, 1)
        else:
            assert op.num_instances == 2, (op.num_instances, 2)
    # Check again after adding operators that override the parallelism
    stream.map(None, "Map1").shuffle().set_parallelism(3).map(
        None, "Map2").set_parallelism(4)
    env._collect_garbage()
    # Expected instance counts: sources stay at 1, "Map1" -> 3,
    # "Map2" -> 4, and every other operator keeps the common level of 2.
    overrides = {"Map1": 3, "Map2": 4}
    for op in env.operators.values():
        if op.type == OpType.Source:
            expected = 1
        else:
            expected = overrides.get(op.name, 2)
        assert op.num_instances == expected, (op.num_instances, expected)
# Returns the second attribute of a tuple
def attribute_selector(record):
    """Return the second element of *record*.

    Used as the attribute selector for the rolling ``sum`` operator:
    each record is expected to be an indexable pair whose summed value
    sits at index 1. (Parameter renamed from ``tuple`` to avoid
    shadowing the builtin.)
    """
    return record[1]


if __name__ == "__main__":
    # Get program parameters
    args = parser.parse_args()
    titles_file = str(args.titles_file)

    ray.init()
    # A Ray streaming environment with the default configuration
    env = Environment()
    env.set_parallelism(2)  # Each operator will be executed by two actors

    # The following dataflow is a simple streaming wordcount
    # with a rolling sum operator.
    # It reads articles from wikipedia, splits them in words,
    # shuffles words, and counts the occurences of each word.
    stream = env.source(Wikipedia(titles_file)) \
                .round_robin() \
                .flat_map(splitter) \
                .key_by(key_selector) \
                .sum(attribute_selector) \
                .inspect(print)     # Prints the contents of the
                                    # stream to stdout

    start = time.time()
    env_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(env_handle)  # Stay alive until execution finishes