Example 1
# Imports assumed from Ray's experimental streaming API
from ray.experimental.streaming.streaming import Environment
from ray.experimental.streaming.operator import OpType


def test_parallelism():
    """Tests operator parallelism."""
    env = Environment()
    # Try setting a common parallelism for all operators
    env.set_parallelism(2)
    stream = env.source(None).map(None).filter(None).flat_map(None)
    env._collect_garbage()
    for operator in env.operators.values():
        if operator.type == OpType.Source:
            # TODO (john): Currently each source has only one instance
            assert operator.num_instances == 1, (operator.num_instances, 1)
        else:
            assert operator.num_instances == 2, (operator.num_instances, 2)
    # Check again after adding an operator with different parallelism
    stream.map(None, "Map1").shuffle().set_parallelism(3).map(
        None, "Map2").set_parallelism(4)
    env._collect_garbage()
    for operator in env.operators.values():
        if operator.type == OpType.Source:
            assert operator.num_instances == 1, (operator.num_instances, 1)
        elif operator.name != "Map1" and operator.name != "Map2":
            assert operator.num_instances == 2, (operator.num_instances, 2)
        elif operator.name != "Map2":
            assert operator.num_instances == 3, (operator.num_instances, 3)
        else:
            assert operator.num_instances == 4, (operator.num_instances, 4)
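As the assertions above imply, set_parallelism() called on a stream applies to the most recently added operator (shuffle() acts as a partitioning strategy rather than a separate operator), while Environment.set_parallelism() sets the default for every operator. A minimal sketch of the same behavior, using the API exactly as in the test above:

# Per-operator parallelism overrides the environment-wide default
env = Environment()
env.set_parallelism(2)  # Default: two instances per operator
stream = env.source(None) \
            .map(None, "Map1").set_parallelism(3) \
            .map(None, "Map2").set_parallelism(4)
# Expected: the source keeps a single instance, Map1 runs with 3 instances,
# Map2 with 4, and any other operator falls back to the default of 2.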
Example 2
import time

import ray
# Import path assumed from Ray's experimental streaming API
from ray.experimental.streaming.streaming import Environment


# Returns the second attribute of a record (the value to be summed)
def attribute_selector(record):
    return record[1]


if __name__ == "__main__":
    # Get program parameters
    args = parser.parse_args()
    titles_file = str(args.titles_file)

    ray.init()

    # A Ray streaming environment with the default configuration
    env = Environment()
    env.set_parallelism(2)  # Each operator will be executed by two actors

    # The following dataflow is a simple streaming wordcount
    # with a rolling sum operator.
    # It reads articles from Wikipedia, splits them into words,
    # shuffles the words, and counts the occurrences of each word.
    stream = env.source(Wikipedia(titles_file)) \
                .round_robin() \
                .flat_map(splitter) \
                .key_by(key_selector) \
                .sum(attribute_selector) \
                .inspect(print)  # Prints the contents of the stream to stdout
    start = time.time()
    env_handle = env.execute()  # Deploys and executes the dataflow
    ray.get(env_handle)  # Stay alive until execution finishes
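The excerpt assumes an argument parser, a custom Wikipedia source, and the splitter/key_selector helpers that are defined elsewhere in the full example. A minimal, hypothetical sketch of what those helpers might look like (the names match the excerpt, but the bodies are assumptions, and the custom Wikipedia source class is omitted):

import argparse

# Hypothetical sketch of the helpers assumed by the excerpt above
parser = argparse.ArgumentParser()
parser.add_argument("--titles-file", dest="titles_file", required=True,
                    help="Path to a file with Wikipedia article titles")


def splitter(line):
    """Splits a line of text into a list of words."""
    return line.split()


def key_selector(word):
    """Keys the word stream by the word itself."""
    return word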