Ejemplo n.º 1
0
    def test_repr(self):
        """Check ``repr(PipelineManager)`` before and after pipelines finish.

        Ten pipelines are started through replaced API hooks, then three
        "pipeline finished" broadcast messages are fed to ``update`` and the
        finished count shown in the repr is expected to go from 0 to 3.
        """
        manager = PipelineManager(api=API_ADDRESS,
                                  broadcast=BROADCAST_ADDRESS)

        pending_ids = [uuid4().hex for _ in range(10)]
        # Snapshot the first three ids now: ``pending_ids`` is consumed by
        # the reply stub below as pipelines are started.
        ids_to_finish = pending_ids[:3]

        def discard_request(x):
            # Stand-in for send_api_request: the request is ignored.
            return None

        def next_reply():
            # Stand-in for get_api_reply: hands out one pre-generated id.
            return {'pipeline id': pending_ids.pop()}

        manager.send_api_request = discard_request
        manager.get_api_reply = next_reply

        # Build every pipeline first, then start them one by one.
        pipelines = []
        for index in range(10):
            definition = {Job('A', data={'index': index}): Job('B')}
            pipelines.append(Pipeline(definition))
        for pipeline in pipelines:
            manager.start(pipeline)

        self.assertEqual(repr(manager),
                         '<PipelineManager: 10 submitted, 0 finished>')

        finished_messages = []
        for pipeline_id in ids_to_finish:
            finished_messages.append(
                'pipeline finished: id={}, duration=0.1'.format(pipeline_id))
        # pop() takes from the end: three True answers, then a final False.
        poll_answers = [False, True, True, True]

        def fake_poll(timeout):
            return poll_answers.pop()

        def fake_receive():
            return finished_messages.pop()

        manager.broadcast_poll = fake_poll
        manager.broadcast_receive = fake_receive
        manager.update(0.1)

        self.assertEqual(repr(manager),
                         '<PipelineManager: 10 submitted, 3 finished>')
Ejemplo n.º 2
0
def send_pipeline_and_wait_finished():
    """Start ten pipelines, wait until all are reported finished, and report.

    Returns a dict with the ``duration`` of the last pipeline started, the
    wall-clock ``real_duration`` spent waiting, and the manager's
    ``finished_pipelines`` / ``started_pipelines`` counters.
    """
    manager = PipelineManager(api=API_ADDRESS,
                              broadcast=BROADCAST_ADDRESS)

    def launch(index):
        # Create one pipeline carrying its index as data and start it
        # right away, so creation and start stay interleaved.
        launched = Pipeline(
            {
                Job(u'worker_1'): Job(u'worker_2'),
                Job(u'worker_2'): Job(u'worker_3')
            },
            data={'index': index})
        manager.start(launched)
        return launched

    pipelines = [launch(index) for index in range(10)]
    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0

    began = time()
    manager.finished(pipelines[0])  # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    ended = time()
    manager.disconnect()

    # All values below are read after disconnecting, as a single literal.
    last_pipeline = pipelines[-1]
    return {
        'duration': last_pipeline.duration,
        'real_duration': ended - began,
        'finished_pipelines': manager.finished_pipelines,
        'started_pipelines': manager.started_pipelines
    }
Ejemplo n.º 3
0
    def test_repr(self):
        """The manager's repr must track submitted and finished pipelines.

        Starts ten pipelines with the API hooks replaced by local callables,
        checks the repr, then delivers three "pipeline finished" messages
        through replaced broadcast hooks and checks the repr again.
        """
        mgr = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)

        available_ids = [uuid4().hex for _ in range(10)]
        # Keep an untouched copy: ``available_ids`` shrinks on every reply.
        all_ids = list(available_ids)

        # Requests are dropped; each reply consumes one pre-generated id.
        mgr.send_api_request = lambda x: None
        mgr.get_api_reply = lambda: {'pipeline id': available_ids.pop()}

        to_start = [Pipeline({Job('A', data={'index': number}): Job('B')})
                    for number in range(10)]
        for item in to_start:
            mgr.start(item)

        before = repr(mgr)
        self.assertEqual(before, '<PipelineManager: 10 submitted, 0 finished>')

        template = 'pipeline finished: id={}, duration=0.1'
        queued = [template.format(finished_id) for finished_id in all_ids[:3]]
        # Consumed from the end: True three times, then False.
        answers = [False, True, True, True]

        mgr.broadcast_poll = lambda timeout: answers.pop()
        mgr.broadcast_receive = queued.pop
        mgr.update(0.1)

        after = repr(mgr)
        self.assertEqual(after, '<PipelineManager: 10 submitted, 3 finished>')
Ejemplo n.º 4
0
def main():
    """Send one pipeline per URL, wait for all to finish, and print a report.

    For every URL a JSON file ``/tmp/<index>.dat`` holding ``{'url': url}``
    is written and a pipeline built from ``pipeline_definition`` is started
    with that filename as its data. Progress is written to stdout while the
    manager is polled, then overall timing, per-pipeline duration statistics
    and the contents of each data file are printed.

    All output uses single-argument ``print(...)`` calls so the function
    produces the same text on Python 2 and Python 3.
    """
    pipeline_definition = {Job('Downloader'): (Job('GetTextAndWords'),
                                               Job('GetLinks'))}
    urls = ['http://www.fsf.org', 'https://creativecommons.org',
            'http://emap.fgv.br', 'https://twitter.com/turicas',
            'http://www.pypln.org', 'http://www.zeromq.org',
            'http://www.python.org', 'http://www.mongodb.org',
            'http://github.com', 'http://pt.wikipedia.org']

    pipeline_manager = PipelineManager(api='tcp://127.0.0.1:5555',
                                       broadcast='tcp://127.0.0.1:5556')
    print('Sending pipelines...')
    start_time = time()
    for index, url in enumerate(urls):
        filename = '/tmp/{}.dat'.format(index)
        with open(filename, 'w') as fp:
            json.dump({'url': url}, fp)
        pipeline = Pipeline(pipeline_definition, data={'filename': filename})
        pipeline_manager.start(pipeline)
        print('  Sent pipeline for url={}'.format(url))

    # print('') emits a single blank line on both Python 2 and Python 3
    # (a bare print() would output "()" on Python 2).
    print('')
    print('Waiting for pipelines to finish...')
    total_pipelines = pipeline_manager.started_pipelines
    finished_pipelines = 0
    while finished_pipelines < total_pipelines:
        pipeline_manager.update(0.5)
        finished_pipelines = pipeline_manager.finished_pipelines
        percentual = 100 * (float(finished_pipelines) / total_pipelines)
        # '\r' returns to the start of the line so the counter is
        # overwritten in place instead of scrolling.
        sys.stdout.write('\rFinished pipelines: {}/{} ({:5.2f}%)'
                         .format(finished_pipelines, total_pipelines,
                                 percentual))
        sys.stdout.flush()
    end_time = time()
    print('\rAll pipelines finished in {} seconds'
          .format(end_time - start_time))

    durations = [pipeline.duration for pipeline in pipeline_manager.pipelines]
    average_duration = sum(durations) / len(durations)
    print('Average pipeline duration (seconds) = {} (min={}, max={})'
          .format(average_duration, min(durations), max(durations)))
    print('')

    print('Some data saved by store:')
    for index, url in enumerate(urls):
        filename = '/tmp/{}.dat'.format(index)
        with open(filename) as fp:
            data = json.load(fp)
        # NOTE(review): assumes the data files now contain the keys
        # download_duration, number_of_words and number_of_links in addition
        # to url -- presumably added while the pipelines ran; confirm.
        print('  url={url}, download_duration={download_duration}, '
              'number_of_words={number_of_words}, '
              'number_of_links={number_of_links}'.format(**data))
Ejemplo n.º 5
0
def send_pipeline_and_wait_finished():
    """Run ten three-worker pipelines to completion and summarise the run.

    The returned dict holds the ``duration`` of the last pipeline created,
    the elapsed ``real_duration`` measured around the wait loop, and the
    manager's ``finished_pipelines`` and ``started_pipelines`` counters.
    """
    manager = PipelineManager(api=API_ADDRESS, broadcast=BROADCAST_ADDRESS)

    started = []
    for index in range(10):
        # Fresh Job objects are created for every pipeline.
        workers = {Job(u'worker_1'): Job(u'worker_2'),
                   Job(u'worker_2'): Job(u'worker_3')}
        pipeline = Pipeline(workers, data={'index': index})
        manager.start(pipeline)
        started.append(pipeline)

    assert manager.started_pipelines == 10
    assert manager.finished_pipelines == 0

    t_begin = time()
    manager.finished(started[0]) # only for testing this method
    while manager.finished_pipelines < manager.started_pipelines:
        manager.update(0.5)
    t_end = time()
    manager.disconnect()

    # ``pipeline`` is still bound to the last pipeline created in the loop.
    summary = {}
    summary['duration'] = pipeline.duration
    summary['real_duration'] = t_end - t_begin
    summary['finished_pipelines'] = manager.finished_pipelines
    summary['started_pipelines'] = manager.started_pipelines
    return summary