Example 1
def cli():
    """Preprocess the samples for BDT optimization.
    """
    # Load the configuration module.
    config = load_config()
    # Create the output directory.
    safe_makedirs('sample')
    # Preprocess the samples in parallel. To guard against deadlock, the number
    # of workers is chosen to be the smaller of the number of available cores
    # or the number of samples to preprocess.
    samples = config.SIGNAL + config.BACKGROUND
    max_workers = min(multiprocessing.cpu_count(), len(samples))
    tasks = []
    with futures.ProcessPoolExecutor(max_workers) as executor:
        for sample in samples:
            # The configuration module cannot be pickled, so pass the options directly.
            tasks.append(
                executor.submit(worker, sample, config.DIRECTORY,
                                config.SELECTION, config.BRANCHES,
                                config.TARGET_LUMI))
        with click.progressbar(label='Preprocessing Samples',
                               length=len(samples),
                               show_pos=True,
                               show_percent=False) as bar:
            for task in futures.as_completed(tasks):
                bar.update(1)
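The worker submitted above is defined elsewhere in that project; what matters for ProcessPoolExecutor is that it is a module-level (and therefore picklable) callable and that the plain options passed to it pickle cleanly, which is why the configuration module itself is not handed over. A hypothetical stub showing only the expected signature:

def worker(sample, directory, selection, branches, target_lumi):
    """Hypothetical stand-in: preprocess one sample with the given options."""
    # ...read the sample from directory, apply selection, keep branches,
    # scale to target_lumi, and write the preprocessed output...
    return sample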
Example 2
def proxy_checker(proxies):
	'''
	Check a list of proxies and return the ones that pass the check.

	proxies is a list of single-entry dicts mapping the proxy address
	(ip:port, e.g. 192.168.1.1:8080) to the proxy type ('http'/'https').
	'''

	logger.info('%d proxies to check' % len(proxies))
	import multiprocessing as mp

	results = []
	with futures.ProcessPoolExecutor(max_workers=mp.cpu_count()*10) as executor:

		# Only http proxies are checked; list() keeps the value lookup working on Python 3 as well.
		future_to_proxy = {
			executor.submit(check_proxy, proxy, 30): proxy
			for proxy in proxies if list(proxy.values())[0] == 'http'}

		for future in future_to_proxy:
			# The callback runs in the parent process as each check finishes;
			# it stores whatever check_proxy returned.
			future.add_done_callback(lambda f: results.append(f.result()))
			
		logger.info('%d http proxies to check'%(len(future_to_proxy)))

		futures.wait(future_to_proxy)

		# for future in futures.as_completed(future_to_proxy):

		# 	proxy = future_to_proxy[future]
		# 	try:
		# 		good, proxy_dict = future.result()
		# 	except Exception as exc:
		# 		logger.info('%r generated an exception: %s'%(proxy, exc))
		# 	else:
		# 		if (good):
		# 			good_proxies.append(proxy_dict)
		
		return [p for (good, p) in results if good]
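check_proxy is not shown above; judging from the final filter it returns a (good, proxy) pair. A rough, hypothetical sketch of such a checker using requests (the test URL, error handling, and return shape are assumptions, not the original implementation):

import requests


def check_proxy(proxy, timeout):
    # proxy is a single-entry dict such as {'192.168.1.1:8080': 'http'}.
    address, proxy_type = list(proxy.items())[0]
    try:
        requests.get('http://example.com/',
                     proxies={proxy_type: '%s://%s' % (proxy_type, address)},
                     timeout=timeout)
        return True, proxy
    except requests.RequestException:
        return False, proxy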
Example 3
def grade(model,
          generation,
          pop,
          x_train,
          y_train,
          x_valid,
          y_valid,
          target,
          do_parallel=False):
    results = np.zeros(len(pop))
    print 'grade: generation %s, population %d' % (generation, len(pop))
    if do_parallel:
        # NOTE: pickling the submitted arguments can fail when running in
        # multiple processes.
        with futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
            for i, individual in enumerate(pop):
                f = executor.submit(fitness_, model, i, individual, x_train,
                                    y_train, x_valid, y_valid, target)
                # result() is called right away, so the evaluations effectively
                # run one at a time.
                index, score = f.result()
                results[index] = score
    else:
        for i, individual in enumerate(pop):
            results[i] = fitness(model, individual, x_train, y_train, x_valid,
                                 y_valid, target)

    mean_ = np.mean(results)

    print_grade(generation, results, pop)

    return mean_
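A note on the parallel branch above: because f.result() is called right after each submit, only one evaluation is in flight at a time. A sketch of the usual submit-then-collect pattern, reusing the names from grade and assuming fitness_ and its arguments pickle cleanly:

with futures.ProcessPoolExecutor(max_workers=cpu_count()) as executor:
    fs = [executor.submit(fitness_, model, i, individual, x_train,
                          y_train, x_valid, y_valid, target)
          for i, individual in enumerate(pop)]
    # Collect scores as the worker processes finish.
    for f in futures.as_completed(fs):
        index, score = f.result()
        results[index] = score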
Example 4
    def flush(self, bucket):
        # NOTE: the bucket argument is shadowed by the loop variable below,
        # so every bucket in the buffer gets flushed.
        with futures.ProcessPoolExecutor(max_workers=1) as executor:
            # self.buffer maps a bucket (used as the file name) to a list of
            # JSON-encoded values.
            for bucket, items in self.buffer.iteritems():
                if len(items) > 0:
                    f = executor.submit(flush_bucket, bucket, items)
                    # flush_bucket runs in a separate process, so the buffer
                    # can be cleared right away.
                    self.clear(bucket)
                    self.futures.append(f)
        return True
Example 5
def json2agreementmatrix(jsonflist,start=2,maxlen=0,task_type='all'):
    """ Multi process function to convert 2 json file annotation combination to
    agreement values (alpha,kappa,Avg Observed agreement)

        Args:
           jsonflist (list):  list of json filenames.
           start (int): combination group size to begin with.
           maxlen(int): maximum count starting from :data:'start'


        Kwargs:
           state (bool): Current state to be in.

        Returns:
           A dict mapping annotator combination to agreement values then
            pickled, yamled and csved.

        Raises:
           Future.Exception
        """
    future_list = []
    detaildata = {}

    flen = len(jsonflist)
    assert start + maxlen - 2 <= flen

    with futures.ProcessPoolExecutor() as executor:
        for cnt in range(start, start + maxlen + 1):
            for tpl in itertools.combinations(jsonflist, cnt):
                future_list.append(executor.submit(
                    getagreement, tpl, os.path.dirname(jsonflist[0]), task_type))

        for future in futures.as_completed(future_list):
            if future.exception() is not None:
                print('%r generated an exception: %s' % (future,
                                                         future.exception()))
            else:
                detaildata.update(future.result())

    # Dump the results next to the input files.
    outdir = os.path.dirname(jsonflist[0])
    outname = str(start) + '-' + str(start + maxlen) + 'out'
    yaml.dump(detaildata, open(os.path.join(outdir, outname + '.yaml'), 'w'))
    csvdump(detaildata, open(os.path.join(outdir, outname + '.csv'), 'w'))
    print "Dumped output"
    return detaildata
Example 6
    def flush(self, bucket):
        logger.debug("i'm getting flushed...")

        with futures.ProcessPoolExecutor(max_workers=1) as executor:
            for k, v in self.buffer[bucket].iteritems():
                for s in v:
                    o = json.loads(s)

                    f = executor.submit(flush_cmd, o[self.data_type], self.data_type, self.template, self.redis_config)

                    self.futures.append(f)

            # send to a different process to operate, clear the buffer
            self.clear(bucket)

        return True
Example 7
def get_perspectives(url):
    '''Get different perspectives on the topic covered by article.

    Args:
        url: A string.

    Returns:
        A JSON-encoded string representing other articles with different
        perspectives than the original article.

        Format: a list of Article.to_dict()s, each with an additional 'sentences'
        attribute. 'sentences' contains a list of sentences with semantically
        different words that were extracted from the corresponding article's body.
    '''
    article = url_to_article(url)
    if article:
        headline = article.headline
        body = article.body
        org = article.news_org

        article_topic = extract_keywords.extract_keywords(headline)

        (NP_to_sentence, VP_to_sentence, NPs, VPs, NP_synsets, VP_synsets) = \
            get_article_phrases(body, org)

        n = len(NEWS_ORGS)
        with futures.ProcessPoolExecutor(max_workers=n) as executor:
            comparisons = executor.map(get_comparison, NEWS_ORGS,
                                       [article_topic] * n,
                                       [NP_to_sentence] * n,
                                       [VP_to_sentence] * n, [NPs] * n,
                                       [VPs] * n, [NP_synsets] * n,
                                       [VP_synsets] * n, [1] * n)
            compared_articles_by_org = list(comparisons)
            # flatten from list of lists of articles (separated by news org) to list
            # of articles
            compared_articles = [
                article for org_articles in compared_articles_by_org
                for article in org_articles
            ]
            return json.dumps(compared_articles)
    else:
        return json.dumps("Not a recognized article")
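The [x] * n lists above only exist to give executor.map iterables of equal length; itertools.repeat expresses the same thing without building the lists, because Executor.map zips its iterables and stops at the shortest one (here NEWS_ORGS). A sketch of that variant, under the same assumptions as the example:

from itertools import repeat

comparisons = executor.map(get_comparison, NEWS_ORGS,
                           repeat(article_topic), repeat(NP_to_sentence),
                           repeat(VP_to_sentence), repeat(NPs), repeat(VPs),
                           repeat(NP_synsets), repeat(VP_synsets), repeat(1))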
Example 8
def run(socket, channels, cmds, nick, logfile):
    # buffer for some command received
    buff = ''
    num_workers = sum(len(v) for k, v in cmds.iteritems())

    #TODO: what happens if I use all the workers?

    #TODO: don't let commands to run for more than one minute

    with futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
        while len(channels):
            receive = socket.recv(4096)
            buff = buff + receive
            response = ''

            if receive:
                log_write(logfile, get_datetime()['time'], ' <> ', receive + \
                    ('' if '\n' == receive[len(receive)-1] else '\n'))

            if -1 != buff.find('\n'):
                # get a full command from the buffer
                command = buff[0:buff.find('\n')]
                buff = buff[buff.find('\n') + 1:]

                # command's components after parsing
                components = parser.parse_command(command)
                to = send_to(command)

                if 'PING' == components['action']:
                    response = []
                    response.append('PONG')
                    response.append(':' + components['arguments'])

                elif 'PRIVMSG' == components['action']:
                    if '!' == components['arguments'][0]:
                        # a command from a user only makes sense if it starts
                        # with an exclamation mark

                        pos = components['arguments'].find(' ')
                        if -1 == pos:
                            pos = len(components['arguments'])

                        # get the command issued to the bot without the "!"
                        cmd = components['arguments'][1:pos]

                        callable_cmd = get_cmd(cmd, cmds['user'], logfile)
                        if callable_cmd:
                            run_cmd(socket, executor, to, callable_cmd,
                                    components, logfile)
                        else:
                            callable_cmd = get_cmd(cmd, cmds['core'], logfile)

                            if callable_cmd:
                                try:
                                    response = callable_cmd(socket, components)
                                except Exception as e:
                                    response = err.C_EXCEPTION.format(
                                        callable_cmd.__name__)

                                    log_write(logfile, response, ' <> ',
                                              str(e) + '\n')

                    # run auto commands
                    for cmd in config.cmds['auto']:
                        callable_cmd = get_cmd(cmd, cmds['auto'], logfile)
                        if callable_cmd:
                            run_cmd(socket, executor, to, callable_cmd,
                                    components, logfile)

                elif 'KICK' == components['action'] and \
                    nick == components['action_args'][1]:
                    channels.remove(components['action_args'][0])

                elif 'QUIT' == components['action'] and \
                        -1 != components['arguments'].find('Ping timeout: '):
                    channels[:] = []

                # this call is still necessary in case that a PONG response or a
                # core command response should be sent, every other response is
                # sent when the futures finish working from their respective
                # thread
                send_response(response, to, socket, logfile)

                buff = ''
Example 9
        V.add_plot({'type': 'raster',
                    'ids': {0: neu_pub},
                    #'yticks': range(1, 1+len(neu_out)),
                    #'yticklabels': range(len(neu_out))
                    },
                    'Generic LPU %s' % i, 'Output')

    V._update_interval = 50
    V.rows = 3
    V.cols = 1
    V.fontsize = 18
    V.out_filename = 'generic_output_%s.avi' % out_name
    V.codec = 'libtheora'
    V.dt = 0.0001
    V.xlim = [0, 1.0]
    V.run()

# Run the visualizations in parallel:
with futures.ProcessPoolExecutor() as executor:
    fs_dict = {}
    for out_name in ['un', 'co']:
        res = executor.submit(run, out_name)
        fs_dict[out_name] = res
    futures.wait(fs_dict.values())

    # Report any exceptions that may have occurred:
    for k in fs_dict:
        e = fs_dict[k].exception()
        if e:
            print '%s: %s' % (k, e)
Example 10
def start_server(config, proxies):
    import copy

    check_config(config)
    config = copy.copy(config)

    folders_to_create = []
    buckets = [
        "tweets", "followers", "follower_ids", "friends", "friend_ids",
        "timelines"
    ]

    ouput_folder = os.path.abspath(config['output'])
    archive_output = os.path.abspath(
        config['archive_output']) if config['archive_output'] else ouput_folder
    archive_output = os.path.join(archive_output, 'archived')

    folders_to_create.append(ouput_folder)
    folders_to_create.append(archive_output)

    for bucket in buckets:
        folders_to_create.append(os.path.join(ouput_folder, bucket))
        folders_to_create.append(os.path.join(archive_output, bucket))

    for folder_to_create in folders_to_create:
        if (not os.path.exists(folder_to_create)):
            os.makedirs(folder_to_create)

    logger.info("output to %s" % (ouput_folder))
    logger.info("archived to %s" % (archive_output))

    this_node_id = node_id()
    node_queue = NodeQueue(this_node_id, redis_config=config['redis_config'])
    node_queue.clear()

    scheduler = Scheduler(this_node_id, config=config, proxies=proxies)

    logger.info('starting node_id: %s' % this_node_id)

    node_coordinator = NodeCoordinator(config['redis_config'])
    #node_coordinator.clear()

    # The main event loop. Strictly speaking we could just join on the crawlers
    # until a terminate command is issued to each of them, but the loop lets us
    # report each crawler's status and run the periodic tarball tasks.

    last_archive_ts = time.time() + 3600  # the first archive event starts 2 hrs later...
    pre_time = time.time()
    last_load_balancing_task_ts = time.time()
    while True:

        if (time.time() - pre_time > 120):
            logger.info(pprint.pformat(scheduler.crawler_status()))
            pre_time = time.time()
            if (scheduler.is_alive()):
                cmd = {'cmd': 'CRAWLER_FLUSH'}
                scheduler.enqueue(cmd)

        if (time.time() - last_archive_ts > 3600):

            logger.info("start archive procedure...")
            with futures.ProcessPoolExecutor(
                    max_workers=len(buckets)) as executor:

                future_proxies = {
                    executor.submit(tarball_results, ouput_folder, bucket,
                                    archive_output,
                                    int(time.time()) - 3600): bucket
                    for bucket in buckets
                }

                for future in future_proxies:
                    future.add_done_callback(lambda f: logger.info(
                        "archive created? %s: [%s]" % f.result()))

            last_archive_ts = time.time()

        # block, the main process...for a command
        if (not scheduler.is_alive()):
            logger.info(
                "no crawler is alive... waiting to recreate all crawlers...")
            time.sleep(120)  # sleep for two minutes and retry
            continue

        if (time.time() - last_load_balancing_task_ts >
                1800):  # try to balance the local queues every 30 mins
            last_load_balancing_task_ts = time.time()
            cmd = {'cmd': 'BALANCING_LOAD'}
            scheduler.enqueue(cmd)

        cmd = node_queue.get(block=True, timeout=360)

        if cmd:
            scheduler.enqueue(cmd)
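The archive callback above calls f.result() directly, so a failure in tarball_results surfaces inside the callback rather than in the main loop. A slightly more defensive variant (a sketch, not the original code) checks Future.exception() first:

def _log_archive_result(f):
    # Called in the parent process when a tarball future finishes.
    exc = f.exception()
    if exc is not None:
        logger.warning("archive task failed: %s" % exc)
    else:
        logger.info("archive created? %s: [%s]" % f.result())

for future in future_proxies:
    future.add_done_callback(_log_archive_result)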
Example 11
def main():
    with futures.ProcessPoolExecutor(max_workers=3) as executor:
        list(executor.map(worker, range(10)))
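Example 11 relies on a worker defined elsewhere. A self-contained variant needs the worker at module level plus the if __name__ == '__main__': guard, which ProcessPoolExecutor requires when processes are started with spawn (the default on Windows, and on macOS in recent Python versions):

from concurrent import futures


def worker(n):
    # Toy CPU-bound task standing in for the real worker.
    return n * n


def main():
    with futures.ProcessPoolExecutor(max_workers=3) as executor:
        print(list(executor.map(worker, range(10))))


if __name__ == '__main__':
    main()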
Example 12
async def startup(ctx):
    ctx['pool'] = futures.ProcessPoolExecutor()
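Example 12 only creates the pool in an arq-style startup hook. A companion sketch (the job and shutdown hook are assumptions, not part of the original) showing how the stored pool might be used from an async job via run_in_executor and closed again on shutdown:

import asyncio
from concurrent import futures


def cpu_heavy(n):
    # Module-level function so it can be pickled for the worker processes.
    return sum(i * i for i in range(n))


async def startup(ctx):
    ctx['pool'] = futures.ProcessPoolExecutor()


async def my_job(ctx, n):
    # Off-load the CPU-bound call so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(ctx['pool'], cpu_heavy, n)


async def shutdown(ctx):
    ctx['pool'].shutdown()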