Example #1
class _Worker(object):
    def __init__(self, protocol=None):
        self.protocol = protocol
        self.pool = ProcessPoolExecutor(max_workers=1)
        self.pool.submit(id, 42).result()  # start the worker process

    def run(self, func, *args, **kwargs):
        """Synchronous remote function call"""

        input_payload = dumps((func, args, kwargs), protocol=self.protocol)
        result_payload = self.pool.submit(
            call_func, input_payload, self.protocol).result()
        result = loads(result_payload)

        if isinstance(result, BaseException):
            raise result
        return result

    def memsize(self):
        workers_pids = [p.pid if hasattr(p, "pid") else p
                        for p in list(self.pool._processes)]
        num_workers = len(workers_pids)
        if num_workers == 0:
            return 0
        elif num_workers > 1:
            raise RuntimeError("Unexpected number of workers: %d"
                               % num_workers)
        return psutil.Process(workers_pids[0]).memory_info().rss

    def close(self):
        self.pool.shutdown(wait=True)
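The class above keeps one warm worker process and reads its RSS via psutil; call_func, dumps and loads come from the surrounding test module and are not shown. A self-contained sketch of the same warm-single-worker idea, using only the standard library:

from concurrent.futures import ProcessPoolExecutor
import os

if __name__ == "__main__":
    pool = ProcessPoolExecutor(max_workers=1)
    pool.submit(id, 42).result()                           # force the worker process to start
    print(pool.submit(os.getpid).result() != os.getpid())  # True: work runs in a child process
    print(pool.submit(sum, [1, 2, 3]).result())            # 6
    pool.shutdown(wait=True)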
Example #2
 def on_message(self, message):
     print(len(message))
     result = yield tornado.gen.Task(self.process_message, message)
     return  # NOTE: the code below is unreachable as written
     pool = ProcessPoolExecutor()
     fut = pool.submit(call_process, message)
     ret = yield fut
     pool.shutdown()
Example #3
def splice_gmaps(threadpool, tilefolder, tempfiles, name):
    processpool = ProcessPoolExecutor()
    caption = "Rendering Zoom Layers {}".format(name)
    loadingbar = Bar(caption=caption)
    loadingbar.set_progress(0, caption)
    pygame.display.update()

    side = 1600
    zoom_levels = 4
    factor = 2 ** (zoom_levels - 1)
    masterside = side * factor
    plates = generate_plate_coords(factor, tempfiles)

    master_surface = pygame.Surface((masterside, masterside))

    done = 0
    total = len(tempfiles) + len(plates) * sum((4 ** x for x in range(zoom_levels)))
    fraction = 100 / total

    def render_base_to_master(task):
        imgdata, size, location = task.result()
        tempsurf = pygame.image.frombuffer(imgdata, size, "RGB")
        master_surface.blit(tempsurf, location)

    tasks = []
    for masterpos, pieces in plates.items():
        master_surface.fill((132, 170, 248))

        for x, y in pieces:
            task = processpool.submit(unpack, tempfiles, x, y, ((x % factor) * side, (y % factor) * side))
            tasks.append(threadpool.submit(render_base_to_master, task))
            tasks.append(task)
        current_area = masterside

        for task in tasks:
            task.result()
            done += 0.5
            loadingbar.set_progress(done * fraction, caption + " %4d of %4d" % (done, total))
        for z in range(zoom_levels):
            tasks = []
            pieces = masterside // current_area
            x_off = masterpos[0] * pieces
            y_off = masterpos[1] * pieces
            for xp in range(pieces):
                for yp in range(pieces):
                    temp = pygame.Surface.subsurface(master_surface,
                                                     (xp * current_area, yp * current_area, current_area, current_area))
                    filename = "screen_{}_{}_{}.png".format(z + 1, x_off + xp, y_off + yp)
                    data = pygame.image.tostring(temp, "RGB")
                    tasks.append(processpool.submit(render_plate, data, tilefolder, temp.get_size(), side, filename))

            for task in tasks:
                task.result()
                done += 1
                loadingbar.set_progress(done * fraction, caption + " %4d of %4d" % (done, total))
            current_area //= 2
    processpool.shutdown()
Example #4
 def post(self):
     file = self.request.files['file'][0]
     hark.client.login()
     hark.client.createSession(default_hark_config)
     log.info("Uploading asynchrounously")
     pool = ProcessPoolExecutor(max_workers=2)
     future = pool.submit(async_upload, file)
     yield future
     pool.shutdown()
     log.info("Rendering visualization page")
     self.render('visualize.html')
Example #5
class ConcurrentDownloader(BaseDownloader, ConcurrentMixin):
    """Concurrent ProcessPoolExecutor downloader

    :param pool_size: size of the ProcessPoolExecutor worker pool
    :param timeout: request timeout in seconds
    """
    def __init__(
            self, worker_class,
            worker_kwargs=None, pool_size=5, middlewares=None,):

        # configure executor
        self.pool_size = pool_size
        self.executor = ProcessPoolExecutor(max_workers=self.pool_size)

        # prepare worker params
        self.worker_params = {
            'worker_class': worker_class,
            'worker_kwargs': worker_kwargs or {},
        }

        # ctrl-c support for python2.x
        # trap sigint
        signal.signal(signal.SIGINT, lambda s, f: s)

        super(ConcurrentDownloader, self).__init__(
            middlewares=middlewares
        )

    def get(self, requests):

        for request in requests:
            # delegate request processing to the executor
            future = self.executor.submit(
                _run_download_worker, self.worker_params, request,
            )

            # build Planned object
            done_future = Planned()

            # when executor finish request - fire done_future
            future.add_done_callback(
                partial(self._done, request, done_future)
            )

            yield done_future

    def get_workers_count(self):
        return self.pool_size

    def stop(self):
        self.executor.shutdown()
Example #6
def runParallelTqdm(func, arglist, workers=1):
    """Handle multiple tasks with tqdm bar in parallel.
       The function to be run must include keyword argument "vid",
       which should be passed to tqdm's position.

    Args:
        func (callable): The function you want to run in parallel
                        example: func(**kwarg, vid)
        arglist (dict/list of dict): arguments for specified function.
                        should be a list of keyword dictionaries.

        workers (int, optional): The number of processes run in parallel
                        At least 1, won't exceed the number of cpu cores.

    Returns:
        [list]: returns of your function in the same order of the arglist
    """
    if not isinstance(arglist, list):
        arglist = [arglist]
    workers = min(max(workers, 1), os.cpu_count())

    slotManager = Manager()
    opened = slotManager.list(range(workers - 1, -1, -1))
    filled = slotManager.dict()

    pb = tqdm(total=len(arglist),
              desc="Overall",
              leave=True,
              position=workers,
              ascii=(os.name == "nt"),
              unit="task",
              mininterval=0.2)

    executor = ProcessPoolExecutor(max_workers=workers)
    tasks = [
        executor.submit(_worker, func, args, opened, filled)
        for args in arglist
    ]

    for _ in as_completed(tasks):
        # Adjust Overall progress bar position
        if len(executor._pending_work_items) < workers:
            pb.clear()
            pb.pos = (-max(filled.values()) - 1) if filled else 0
        pb.refresh()
        pb.update(1)

    executor.shutdown(wait=True)
    pb.close()
    return [task.result() for task in tasks]
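A usage sketch for the calling convention described in the docstring; the _worker helper submitted above is not shown here, so this only illustrates the expected shape of func and arglist (it assumes _worker forwards each keyword dictionary plus a slot index as vid to func):

def crunch(n, vid=0):
    # hypothetical task; "vid" receives the tqdm slot index from _worker
    return sum(i * i for i in range(n))

arglist = [{"n": 10_000 * (k + 1)} for k in range(8)]
# results come back in the same order as arglist:
# results = runParallelTqdm(crunch, arglist, workers=4)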
Example #7
async def main():
    executor = ProcessPoolExecutor(4,
                                   initializer=reader.initializer,
                                   initargs=(log, db_path, 'mainnet', 1.0,
                                             True))
    #await run_times(executor, 4, show=False)
    #await run_times(executor, 1)
    await run_times(executor, 2**3)
    await run_times(executor, 2**5)
    await run_times(executor, 2**7)
    #await run_times(executor, 2**9)
    #await run_times(executor, 2**11)
    #await run_times(executor, 2**13)
    executor.shutdown(True)
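The initializer/initargs pair above runs reader.initializer once in every worker process before any task executes. A self-contained sketch of that pattern (Python 3.7+), with hypothetical names standing in for the module's reader, log and db_path:

from concurrent.futures import ProcessPoolExecutor

_config = None

def _init_worker(network):
    # runs once per worker process, before any submitted task
    global _config
    _config = network

def _task(x):
    return (_config, x)

if __name__ == "__main__":
    with ProcessPoolExecutor(2, initializer=_init_worker, initargs=("mainnet",)) as ex:
        print(list(ex.map(_task, range(4))))  # each tuple carries the per-worker config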
Example #8
def unzipfile(tofilepath):
    for root, dirs, files in os.walk(tofilepath):
        pool = ProcessPoolExecutor(max_workers=20)
        for file in files:
            # only handle .bz2 archives
            if file[-4:] == '.bz2':
                filefullname = os.path.join(root, file)
                unzipfile = filefullname[:-4]
                print(unzipfile)
                if not os.path.exists(unzipfile):
                    # skip extraction if the unpacked file already exists
                    # (although the shell command below checks for it as well)
                    obj = pool.submit(runshell, filefullname)
        pool.shutdown(wait=True)
Example #9
def monitor_points(filename, time2wait=1):
    '''
    Read a file with this format:
    ---
    ROOM
    monitor point
    --
    such as:

    Blumar/Sealand2/SM3B/Biofiltros/Biofiltros1/Oxygen/Saturation

    called:
                   |--> database  |--> room
    namespace = Blumar/Sealand2/SM3B/Biofiltros/Biofiltros1/Oxygen/Saturation
                        |_____________________________________________> monitor
    then 
    a dict is formed such as:
    {database|room: [monitor1, monitor2, monitorn]}

    then for each ROOM execute a thread to check if the room is stuck:
    goto: is_room_stuck
    '''
    fpt = open(filename, 'r')
    lines = fpt.readlines()
    all_threads = {}
    temp = []
    
    for data in lines:
        if data.__contains__('ROOM'):
            if len(temp) != 0:
                all_threads.update({database+"|"+room:temp})
                temp = []
        elif not data.__contains__('#'):
                data = data.rsplit('\n')[0]
                namespace = data.split("/")
                database = namespace[0]
                room = namespace[2]
                monitor = namespace[1]+"/"+namespace[2]+"/"+namespace[3]+"/"+namespace[4]+"/"+namespace[5]+"/"+namespace[6]
                temp.append(monitor)
    all_threads.update({database+"|"+room:temp})
    executor = ProcessPoolExecutor(max_workers=len(all_threads))
    tasks_results = []
    for k, v in all_threads.items():
        task = executor.submit(is_room_stuck, k, v, time2wait)
        # NOTE: result() blocks, so rooms are effectively checked one at a time
        if task.result() is True:
            msg = 'Check ROOM %s , it seems to be off' % k
            send_email(room=k, mess=msg)

    executor.shutdown(wait=True)
Example #10
class Client:
    def __init__(self, dispatcher):
        self._dispatcher = dispatcher
        self._executor = ProcessPoolExecutor()

    def shutdown(self, wait):
        self._executor.shutdown(wait=wait)

    async def invoke(self, path, *args):
        return await asyncio.get_event_loop().run_in_executor(
            self._executor,
            self._dispatcher.invoke,
            path,
            *args
        )
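A minimal driver sketch with a hypothetical dispatcher (the real dispatcher type is not shown above). Because run_in_executor ships the bound invoke method to a worker process, the dispatcher must be picklable and defined at module level:

import asyncio

class EchoDispatcher:
    # hypothetical stand-in for the real dispatcher
    def invoke(self, path, *args):
        return (path, args)

async def demo():
    client = Client(EchoDispatcher())
    print(await client.invoke("/ping", 1, 2))   # ('/ping', (1, 2))
    client.shutdown(wait=True)

if __name__ == "__main__":
    asyncio.run(demo())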
Example #11
def main():
    start = time.perf_counter()

    processor = ProcessPoolExecutor(NUM_PROCESSES)
    futures = []

    def render(*args):
        futures.append(processor.submit(render_image, *args))

    for filename in generate_filenames(CORE, CORE_EXCLUDE):
        render(filename, 0.7, 0.1)

    # Misc Stuff
    for filename in generate_filenames(MISC_STUFF):
        render(filename, 0.6, 0.05)

    # Base Entities
    for filename in generate_filenames(BASE_ENTITIES, ENTITY_EXCLUDE):
        render(filename, 0.7, 0.05)

    # Entities that need more color
    for filename in generate_filenames(BRIGHT_ENTITIES, ENTITY_EXCLUDE):
        render(filename, 0.7, 0.10)

    # Terrain
    for filename in generate_filenames(TERRAIN, TERRAIN_EXCLUDE):
        render(filename, 1, 0.4)

    # Ore
    for filename in generate_filenames(ORE, ORE_EXCLUDE):
        render(filename, 0.7, 0.2)

    # Wait for all tasks to complete or the first one to raise an exception
    result = wait(futures, return_when=FIRST_EXCEPTION)

    # Cancel pending tasks after one failed with an exception
    for pending in result.not_done:
        pending.cancel()

    # Wait for processor to complete all pending tasks that could not be canceled.
    processor.shutdown()

    # Retrieve result for all tasks, this will re-raise the exception if the
    # task failed and cause it to be printed to the console
    for done in result.done:
        done.result()

    print(f"Done in {time.perf_counter() - start:.1f}s")
Example #12
def run(process_num, *filename):  # split the input into multiple subtasks to compute the results
    # instantiate a process pool with process_num processes
    executor = ProcessPoolExecutor(process_num)
    start = time.time()
    fs = []  # list of futures
    print(filename[0])
    with open(filename[0], 'r') as f:
        for each_line in f.readlines():
            fs.append(
                executor.submit(weakfilescan,
                                each_line.replace(os.linesep, '')))
    wait(fs)  # wait for the computations to finish
    end = time.time()
    duration = end - start
    print("total cost: %.2fs" % duration)
    executor.shutdown()  # tear down the process pool
Example #13
def multi_process_submit(
    func,
    items,
    max_workers=10,
):
    executor = ProcessPoolExecutor(max_workers=max_workers)

    future_list = []
    for item in items:
        future_list.append(
            executor.submit(func, *item["args"],
                            **item["kwargs"])  # .add_done_callback()
        )
    done_iter = as_completed(future_list)
    executor.shutdown(wait=True)
    return done_iter
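A usage sketch for the item format expected by multi_process_submit above, where each item supplies its own args and kwargs; the worker must live at module level so the pool can pickle it. Since shutdown(wait=True) runs before the iterator is returned, every future is already finished by the time it is consumed:

def scale(x, factor=1):
    # hypothetical worker used only for illustration
    return x * factor

if __name__ == "__main__":
    items = [{"args": (i,), "kwargs": {"factor": 10}} for i in range(5)]
    for fut in multi_process_submit(scale, items, max_workers=2):
        print(fut.result())   # 0, 10, ... in completion order, not input order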
Example #14
def build_lut(cqa_train_data):
    print("Building lookup table for question and answer tokens")

    pool = ProcessPoolExecutor(max_workers=8)
    questions = list(pool.map(tokenize, cqa_train_data, chunksize=1000))
    pool.shutdown()
    print("Finished")

    maxlen = max([len(q) for q in questions])
    unique_tokens = set([t for q in questions for t in q])
    ques2idx = {word: idx + 1
                for idx, word in enumerate(unique_tokens)
                }  # save 0 for padding
    answers = set([q['answer'] for q in cqa_train_data])
    ans2idx = {ans: idx for idx, ans in enumerate(answers)}
    return ans2idx, ques2idx, maxlen
Example #15
def run_experiments(net):
    csvfile = open(network_path[int(sys.argv[1])] + ".csv", 'a')
    logwriter = csv.writer(csvfile,
                           delimiter=',',
                           quotechar='|',
                           quoting=csv.QUOTE_MINIMAL)

    executor = ProcessPoolExecutor(max_workers=32)

    d_bound = shiva.max_degree(net)
    max_degree = shiva.max_degree(net)
    total_triangles = shiva.total_triangles(net)

    original_lps = [None] * 5
    for d in range(5):
        D = d_bound / (2**d)
        original_lps[d] = executor.submit(shiva.linear_program_solve, net, D)

    sample_lps = [[None] * 5 for _ in range(5)]  # independent inner lists (avoid aliasing)
    for d in range(5):
        D = d_bound / (2**d)

        for k in range(5):
            p = 1 / (2**(k + 1))
            sample_lps[d][k] = experiment(executor, net, D, p)

    for d in range(5):
        D = d_bound / (2**d)

        for k in range(5):
            p = 1 / (2**(k + 1))
            if sample_lps[d][k] == -1:
                sample_lp = -1
            else:
                sample_lp = sum(x.result() for x in sample_lps[d][k]) \
                    / len(sample_lps[d][k])
            logwriter.writerow([
                max_degree,
                total_triangles,
                D,
                p,
                original_lps[d].result(),
                sample_lp,
                original_lps[d].result() / sample_lp,
            ])

    executor.shutdown(wait=True)
Example #16
    def _eval_proc(self, jc_entry: JobComponentEntry, train_output_file: str):
        job_detail = jc_entry.job['job_detail']
        eval_args = job_detail.get('eval_args',
                                   job_detail.get('evaluation_args'))
        if eval_args and job_detail.get('test_data_args'):
            try:
                if self.is_local:
                    eval_output_file = eval_args.get('output_file', '').strip()
                    if not eval_output_file:
                        comp_name = re.sub(r'\s+', '_', self.job_name)
                        eval_output_file = ComponentOutput.MODEL_EVALUATION_OUTPUT + "-" + comp_name + "_local"
                    ppool = ProcessPoolExecutor(1)
                    print("{}: start evaluation process".format(self.job_name))
                    ppool.submit(local_eval_func, self.job_name, jc_entry.job,
                                 jc_entry.pack_path, jc_entry.export_path,
                                 train_output_file, eval_output_file).result()
                    print("{}: evaluation process finished".format(
                        self.job_name))
                    ppool.shutdown(wait=True)
                else:
                    eval_output_file = eval_args.get('output_file', '').strip()
                    if not eval_output_file:
                        comp_name = re.sub(r'\s+', '_', self.job_name)
                        eval_output_file = ComponentOutput.MODEL_EVALUATION_OUTPUT + "-" + comp_name + "_tfjob"

                    eval_job = copy.deepcopy(jc_entry.job)
                    if eval_args.get('num_test_samples', 0) <= 0:
                        print(
                            "{}: 'num_test_samples' is not set, auto fallback to 1-worker evaluation"
                            .format(self.job_name))
                        eval_job['num_workers'] = 1
                    tfjob_launcher = EvalTFJobLauncher(
                        self.job_name + " Evaluation TFjob launcher",
                        args=[
                            "--job",
                            json.dumps(eval_job), "--pack-path",
                            jc_entry.pack_path, "--export-path",
                            jc_entry.export_path, "--output-file",
                            eval_output_file, "--upstream-output-file",
                            train_output_file
                        ])
                    tfjob_launcher.run()
            except Exception as e:
                print("{}: WARING: evaluation model failed: {}\n{}".format(
                    self.job_name, e, traceback.format_exc()))
        else:
            print("{}: skip evaluation step".format(self.job_name))
Example #17
def get_first_order(G):
    print("1st order: ")
    global EV, VE, EV_over_delta, VE_over_delta, node_nbr, node_degree

    EV = G.EV
    VE = G.VE
    EV_over_delta = G.EV_over_delta
    VE_over_delta = G.VE_over_delta
    node_nbr = G.node_nbr
    node_degree = G.node_degree

    processes_num = 80
    pool = ProcessPoolExecutor(max_workers=processes_num)
    process_list = []

    nodes = np.copy(G.nodes)

    split_num = min(processes_num, int(len(nodes) / 100)) + 1
    print("split_num", split_num)
    np.random.shuffle(nodes)
    nodes = np.array_split(nodes, split_num)

    print("Start get first order")
    for node in nodes:
        process_list.append(pool.submit(get_first_order_part, node))

    alias_n2n_1st = {}
    node2ff_1st = {}
    for p in as_completed(process_list):
        alias_t1, alias_t2 = p.result()
        alias_n2n_1st.update(alias_t1)
        node2ff_1st.update(alias_t2)

    pool.shutdown(wait=True)

    print("start turn dict to list")

    nodes = np.copy(G.nodes)

    alias_n2n_1st_list = [[] for n in nodes]
    node2ff_1st_list = [[] for n in nodes]

    for n in nodes:
        alias_n2n_1st_list[n] = alias_n2n_1st[n]
        node2ff_1st_list[n] = node2ff_1st[n]

    return alias_n2n_1st_list, node2ff_1st_list
Example #18
class LBRYSessionManager(SessionManager):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.query_executor = None
        self.websocket = None
        self.metrics = ServerLoadData()
        self.metrics_loop = None
        self.running = False
        if self.env.websocket_host is not None and self.env.websocket_port is not None:
            self.websocket = AdminWebSocket(self)
        self.search_cache = self.bp.search_cache
        self.search_cache['search'] = lrucache(10000)
        self.search_cache['resolve'] = lrucache(10000)

    async def process_metrics(self):
        while self.running:
            data = self.metrics.to_json_and_reset(
                {'sessions': self.session_count()})
            if self.websocket is not None:
                self.websocket.send_message(data)
            await asyncio.sleep(1)

    async def start_other(self):
        self.running = True
        args = dict(initializer=reader.initializer,
                    initargs=(self.logger, 'claims.db', self.env.coin.NET,
                              self.env.database_query_timeout,
                              self.env.track_metrics))
        if self.env.max_query_workers is not None and self.env.max_query_workers == 0:
            self.query_executor = ThreadPoolExecutor(max_workers=1, **args)
        else:
            self.query_executor = ProcessPoolExecutor(
                max_workers=self.env.max_query_workers
                or max(os.cpu_count(), 4),
                **args)
        if self.websocket is not None:
            await self.websocket.start()
        if self.env.track_metrics:
            self.metrics_loop = asyncio.create_task(self.process_metrics())

    async def stop_other(self):
        self.running = False
        if self.env.track_metrics:
            self.metrics_loop.cancel()
        if self.websocket is not None:
            await self.websocket.stop()
        self.query_executor.shutdown()
Example #19
def process(pixel_path, base_data_dir, session):
    conn = pg.open(CONFIG["Database"]["URI"])
    print("Query Images.....")
    prep_stmt = conn.query(
        "SELECT snapshot_id, detection_id, runguid::text, imagepath, view_matrix, proj_matrix, handle, pos::bytea, rot::bytea, bbox,"
        "ngv_box3dpolygon(bbox3d)::bytea as fullbox,"
        "ST_MakePoint(ST_XMin(bbox3d), ST_YMin(bbox3d), ST_ZMin(bbox3d))::bytea as bbox3d_min,"
        "ST_MakePoint(ST_XMax(bbox3d), ST_YMax(bbox3d), ST_ZMax(bbox3d))::bytea as bbox3d_max FROM detections JOIN snapshots USING (snapshot_id) JOIN runs USING (run_id) JOIN sessions USING(session_id)"
        "WHERE session_id=$1 and processed=false and camera_pos <-> pos < 200 order by snapshot_id desc", session)

    pbar = ProgressBar(max_value=len(prep_stmt)).start()
    i = 0
    sem = Semaphore(100)
    pool = ProcessPoolExecutor(100)
    results = []
    conn.close()
    lck = Lock()
    def on_done(snapshot_id, x):
        result = x.result()
        sem.release()
        with lck:
            nonlocal i
            nonlocal results
            pbar.update(i)
            i += 1
            #if result is None: return
            #results.append((snapshot_id, x.result()[0], x.result()[1]))
            upload([(snapshot_id, result[0], result[1])], Path(pixel_path))
        
    last_id = 0
    for snapshot_id, detections in groupby(prep_stmt, key=lambda x: x['snapshot_id']):
        sem.acquire()
        detections = list(detections)
        last_id = snapshot_id
        #on_done(snapshot_id, process_detections(base_data_dir, detections))
        result = pool.submit(process_detections, base_data_dir, detections)
        result.add_done_callback(partial(on_done, snapshot_id))
    pool.shutdown(wait=True)
    pbar.finish()

    conn = pg.open(CONFIG["Database"]["URI"])
    conn.query("UPDATE snapshots set processed=false where snapshot_id=$1", last_id)
    conn.close()
    # print(results)

    return results
Example #20
class ProcessPoolOpInvoker(ModelOpInvoker):
    def __init__(self, model, func, n_jobs, persist_method):
        if isinstance(model, PersistedModel):
            key = model
        else:
            key = persist(model, method=persist_method)
        ctx = LKContext.INSTANCE
        _log.info('setting up ProcessPoolExecutor w/ %d workers', n_jobs)
        kid_tc = proc_count(level=1)
        self.executor = ProcessPoolExecutor(n_jobs, ctx, _initialize_mp_worker,
                                            (key, func, kid_tc, log_queue()))

    def map(self, *iterables):
        return self.executor.map(_mp_invoke_worker, *iterables)

    def shutdown(self):
        self.executor.shutdown()
Example #21
class ConanFitService:
    @inject
    def __init__(self, default_params_factory: ConanParamsFactory) -> None:
        self._executor = ProcessPoolExecutor(max_workers=1)
        self._default_params_factory = default_params_factory

    def fit(self, data: np.ndarray, params: Optional[ConanFitParams] = None):
        params = params or self._default_params_factory.create()
        params_dict = {
            'baseline': params.baseline,
        }
        cfut = self._executor.submit(contact_angle_fit, data, **params_dict)
        fut = asyncio.wrap_future(cfut, loop=asyncio.get_event_loop())
        return fut

    def destroy(self) -> None:
        self._executor.shutdown()
Example #22
def albumSpider():
    print("======= 开始爬 专辑 信息 ===========")
    startTime = datetime.datetime.now()
    print(startTime.strftime('%Y-%m-%d %H:%M:%S'))
    # total number of artists
    artists_num = sql.get_all_artist_num()
    # number of batches
    batch = math.ceil(artists_num.get('num') / 1000.0)
    # build the process pool
    pool = ProcessPoolExecutor(3)
    for index in range(0, batch):
        pool.submit(saveAlbumBatch, index)
    pool.shutdown(wait=True)
    print("======= 结束爬 专辑 信息 ===========")
    endTime = datetime.datetime.now()
    print(endTime.strftime('%Y-%m-%d %H:%M:%S'))
    print("耗时:", (endTime - startTime).seconds, "秒")
Example #23
def cat_Lable_Cnt_Fun(train_data, y, test_data, config):
    timer = Timer()
    cat_feature_list = [
        c for c in train_data if c.startswith(CONSTANT.CATEGORY_PREFIX)
    ]
    if len(cat_feature_list) == 0: return None

    # train_data_length = len(train_data)
    train_data[LABEL] = y

    row_sp = int(np.ceil((len(train_data) + len(test_data)) / 1000000))
    col_sp = int(np.ceil(len(cat_feature_list) / 20))
    sp = row_sp * col_sp
    print(
        f' **** We should split it as {sp}, {col_sp}-{row_sp} sp to process! ****'
    )
    cols_split = np.array_split(cat_feature_list, sp)
    data_list = []
    for i, cols in enumerate(cols_split):
        if len(cols) >= 1:
            pool = ProcessPoolExecutor(4)
            result_list = pool.map(cat_Lable_Cnt_Fun_sub, [[
                train_data[[col, LABEL]], test_data[[col]], col,
                config['pos_rate'], config[CONSTANT.TRAIN_LEN_OF_TRAIN_VAL]
            ] for col in cols])

            pool.shutdown(wait=True)

            for i_data in result_list:
                if i_data is not None:
                    data_list += i_data

            print(f'{i} split successful')

    # feature_data = pd.concat(data_list, axis=1, copy=False)
    # feature_data.columns = name_list
    # timer.check("label count map done")
    # del data_list
    # gc.collect()

    test_data.drop(cat_feature_list, axis=1, inplace=True)
    cat_feature_list += [LABEL]
    train_data.drop(cat_feature_list, axis=1, inplace=True)
    timer.check("drop")

    return data_list
Example #24
class ProcessPoolParallelizer(Parallelizer):
    """A Parallelizer based on concurrent.futures.ProcessPoolExecutor."""
    def __init__(self, options):
        options.set_smart_defaults(num_tasks=default_num_tasks)
        if sys.version_info >= (3, 8, 0) and sys.platform != 'win32':
            ctx = get_context('fork')
            self.pool = ProcessPoolExecutor(options.num_tasks, mp_context=ctx, initializer=process_initializer)
        else:
            self.pool = ProcessPoolExecutor(options.num_tasks, initializer=process_initializer)

        self.process_func = partial(evaluate_step, options=options)

    def solve_circuits_parallel(self, tuples):
        return self.pool.map(self.process_func, tuples)

    def done(self):
        self.pool.shutdown()
Example #25
def parse_page_data(futures):
    result = futures.result()
    data = result[0]
    next_page_url = result[1]

    if next_page_url:
        handler = page_pool.submit(down_load_page_data, next_page_url)
        handler.add_done_callback(parse_page_data)

    page = data['page']
    html = data['data']
    # create a process pool (to fetch the activity detail page source)
    detail_pool = ProcessPoolExecutor(2)

    if page == 1:
        print('parsing page 1 data (static page)')
        html_element = etree.HTML(html)
        hot_active = html_element.xpath('//div[@class="hot_detail fn-clear"]')
        for hot_div in hot_active:
            # URL of the activity detail page
            full_detail_url = 'http://date.jiayuan.com' + hot_div.xpath(
                './/h2[@class="hot_title"]/a/@href')[0]
            handler = detail_pool.submit(download_detail_data, full_detail_url)
            handler.add_done_callback(parse_detail_data)
        more_active = html_element.xpath(
            '//ul[@class="review_detail fn-clear t-activiUl"]/li')
        for more_li in more_active:
            # URL of the activity detail page
            full_detail_url = 'http://date.jiayuan.com' + more_li.xpath(
                './/a[@class="review_link"]/@href')[0]
    else:
        print('parsing page ' + str(page) + ' data', 'non-static page')
        # use json.loads() to turn the JSON string into Python data
        json_obj = json.loads(html)
        if isinstance(json_obj, list):
            # a list means we received valid data
            print('parsing data')
            for sub_dict in json_obj:
                id = sub_dict['id']
                # http://date.jiayuan.com/activityreviewdetail.php?id=11706
                full_detail_url = 'http://date.jiayuan.com/activityreviewdetail.php?id=%s' % id
                handler = detail_pool.submit(download_detail_data,
                                             full_detail_url)
                handler.add_done_callback(parse_detail_data)
    detail_pool.shutdown()
Example #26
def preprocess_transition_probs(sg):
    '''
    Preprocessing of transition probabilities for guiding the random walks.
    '''
    global sG
    sG = sg
    G = sG.G
    is_directed = sG.is_directed

    print("transition probs: ")
    alias_nodes = {}
    for node in tqdm(G.nodes()):
        unnormalized_probs = [
            G[node][nbr]['weight'] / np.sqrt(sG.degree[nbr])
            for nbr in sG.neighbors[node]
        ]
        # unnormalized_probs = [G[node][nbr]['weight'] for nbr in sG.neighbors[node]]
        norm_const = sum(unnormalized_probs)
        normalized_probs = [
            float(u_prob) / norm_const for u_prob in unnormalized_probs
        ]
        alias_nodes[node] = alias_setup(normalized_probs)

    triads = {}

    # Parallel alias edges
    print("alias edges: ")
    edges = G.edges()

    threads_num = 100
    pool = ProcessPoolExecutor(max_workers=threads_num)
    process_list = []

    edges = np.array_split(edges, threads_num * 2)
    for e in edges:
        process_list.append(pool.submit(alias_some_edges, e))

    alias_edges = {}
    for p in as_completed(process_list):
        alias_t = p.result()
        alias_edges.update(alias_t)
    pool.shutdown(wait=True)

    sG.alias_nodes = alias_nodes
    sG.alias_edges = alias_edges
Example #27
def get_benchmark(paths, car_df, road_df, cross_df, process_num=4):
    """
    Plan paths directly and, assuming no congestion, compute the ideal running time.
    Implemented with multiprocessing.
    :param paths: the planned path of every car, as a dict {carID: [edge path]}
    :param car_df:
    :param road_df:
    :return:  car_time_cost: time cost of each car, {carID: time cost}
              all_time_cost: total time cost of all cars
    """
    car_time_cost = {}
    all_time_cost = 0

    carL = list(car_df['id'])
    carL_len = len(carL)

    # split the data for the worker processes
    N = int(carL_len / process_num)
    splice = [N * x for x in range(process_num)]
    splice.append(carL_len)

    # start the worker processes
    print('get_benchmark: ')
    try:
        p = ProcessPoolExecutor(max_workers=process_num)
        obj_l = []
        for st, ed in zip(splice[:-1], splice[1:]):
            obj = p.submit(__get_time_cost, paths, carL[st:ed], car_df,
                           road_df)
            obj_l.append(obj)

        p.shutdown(wait=True)

        # merge the results from the worker processes

        #    print([len(obj.result()) for obj in obj_l])
        for obj in obj_l:
            car_time_cost.update(obj.result()[0])
            all_time_cost += obj.result()[1]
    except:
        print("Multi-processing failed, using single processing now")
        car_time_cost, all_time_cost = __get_time_cost(paths, carL, car_df,
                                                       road_df)

    return car_time_cost, all_time_cost
Example #28
def main_1(snapshotList):
    voc = VOC_Generator(snapshotList[0].runguid, output_dir)
    voc.create_folders(voc)

    sem_tmp = Semaphore(2)
    pool_tmp = ProcessPoolExecutor(2)
    lck_tmp = Lock()

    def on_done_tmp(x):
        snapshotIDList = x.result()
        sem_tmp.release()
        with lck_tmp:
            voc.saveTrainTest(voc, snapshotIDList)

    sem_tmp.acquire()
    result_tmp = pool_tmp.submit(main_fn, snapshotList, voc)
    result_tmp.add_done_callback(partial(on_done_tmp))
    pool_tmp.shutdown(wait=True)
Example #29
    def test_concurrency(self):
        num_of_dependency_managers = 10
        executor = ProcessPoolExecutor(max_workers=num_of_dependency_managers)

        random_file_path = os.path.join(self.work_dir, "random_file")
        with open(random_file_path, "wb") as f:
            f.seek((1024 * 1024 * 1024) - 1)  # 1 GB
            f.write(b"\0")

        futures = [
            executor.submit(task, self.work_dir, self.state_path,
                            random_file_path)
            for _ in range(num_of_dependency_managers)
        ]
        for future in futures:
            print(future.result())
            self.assertIsNone(future.exception())
        executor.shutdown()
Example #30
 def _predict_proc(self, jc_entry: JobComponentEntry,
                   train_output_file: str):
     job_detail = jc_entry.job['job_detail']
     if job_detail.get('pred_data_args', job_detail.get('predict_data_args')) and \
             job_detail.get('pred_args', job_detail.get('predict_args')):
         try:
             ppool = ProcessPoolExecutor(1)
             print("{}: start prediction process".format(self.job_name))
             ppool.submit(predict_func, self.job_name, jc_entry.job,
                          jc_entry.pack_path, jc_entry.export_path,
                          train_output_file).result()
             print("{}: prediction process finished".format(self.job_name))
             ppool.shutdown(wait=True)
         except Exception as e:
             print("{}: WARING: prediction failed: {}\n{}".format(
                 self.job_name, e, traceback.format_exc()))
     else:
         print("{}: skip prediction step".format(self.job_name))
Example #31
def make_arch_db():
    executor = ProcessPoolExecutor(max_workers=8)
    by = 10000
    m = 60000000
    #by = 2000
    #m = 10000
    e = executor.map(process_range, zip(range(0, m, by),range(by, m+by, by)))
    executor.shutdown()
    print('done calculating architectures')
    pfam_sets = merge(e)
    print(len(pfam_sets))
    gsave(pfam_sets,'pfam_sets.pkl.gz')
    
    # mongodb
    db = MongoClient('wl-cmadmin', 27017).ArchDB_Pfam_071414.ArchDB_Pfam_071414
    db.insert(map(lambda item: {'_id': min(item[1]), 'pID': list(item[1]), 'Pfam': item[0]}, pfam_sets.items()))
    db.ensure_index('pID')
    db.ensure_index('Pfam')
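The map call above fans half-open (start, end) ranges out to the workers. A self-contained sketch of that chunking pattern with a hypothetical process_range:

from concurrent.futures import ProcessPoolExecutor

def process_range(bounds):
    # hypothetical stand-in: sum the integers in [start, end)
    start, end = bounds
    return sum(range(start, end))

if __name__ == "__main__":
    by, m = 1000, 10000
    with ProcessPoolExecutor(max_workers=4) as ex:
        chunks = ex.map(process_range, zip(range(0, m, by), range(by, m + by, by)))
        print(sum(chunks))  # == sum(range(10000)) == 49995000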
Example #32
class LBRYSessionManager(SessionManager):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.query_executor = None

    async def start_other(self):
        args = dict(initializer=reader.initializer,
                    initargs=('claims.db', self.env.coin.NET))
        if self.env.max_query_workers is not None and self.env.max_query_workers == 0:
            self.query_executor = ThreadPoolExecutor(max_workers=1, **args)
        else:
            self.query_executor = ProcessPoolExecutor(
                max_workers=self.env.max_query_workers
                or max(os.cpu_count(), 4),
                **args)

    async def stop_other(self):
        self.query_executor.shutdown()
Example #33
    def handle_stream(self, stream, address):
        pool = ProcessPoolExecutor(max_workers=1)
        while True:
            try:
                # read protocol version
                protocol_ver = yield stream.read_bytes(MSG_LEN)
                protocol_ver = struct.unpack('>I', protocol_ver)[0]

                # Read message length (4 bytes) and unpack it into an integer
                raw_msg_length = yield stream.read_bytes(MSG_LEN)
                msg_length = struct.unpack('>I', raw_msg_length)[0]

                app_log.debug("Handle request (Protocol: v%d, Msg size: %d)",
                              protocol_ver, msg_length)

                data = yield stream.read_bytes(msg_length)
                msg = msgpack.unpackb(data,
                                      object_hook=decode_np_array,
                                      use_list=False,
                                      encoding='utf-8')

                try:
                    fut = pool.submit(handle_request, msg)
                    response = yield fut
                except Exception:
                    app_log.exception('Error in subprocess')
                    response = msgpack.packb(
                        {
                            'status': INTERNAL_SERVER_ERROR,
                        },
                        default=encode_np_array)

                yield stream.write(struct.pack('>I', PROTOCOL_VER))

                # Prefix each message with a 4-byte length (network byte order)
                yield stream.write(struct.pack('>I', len(response)))

                yield stream.write(response)
            except StreamClosedError:
                app_log.info("Lost client at host %s", address)
                break
            except Exception:
                app_log.exception('Error while handling client connection')
        pool.shutdown()
Example #34
def parallel_build_hash(data,
                        func,
                        args,
                        num,
                        initial=None,
                        compress=False,
                        max_size=-1):
    import multiprocessing
    cpu_num = multiprocessing.cpu_count()
    data = np.array_split(data, cpu_num * 1)
    dict1 = deepcopy(initial)
    pool = ProcessPoolExecutor(max_workers=cpu_num)
    process_list = []

    if func == 'build_hash':
        func = build_hash
    if func == 'build_hash2':
        func = build_hash2
    if func == 'build_hash3':
        func = build_hash3

    for datum in data:
        process_list.append(pool.submit(func, datum, compress, max_size))

    for p in as_completed(process_list):
        a = p.result()
        if compress:
            dict1 = dict1.union(a)
        else:
            dict1.update(a)
        del a
    pool.shutdown(wait=True)

    # if args.data in ['schic','ramani']:
    # 	print (num[0])
    # 	new_list_of_set = [set() for i in range(int(num[0]+1))]
    # 	for s in dict1:
    # 		try:
    # 			new_list_of_set[s[0]].add(s)
    # 		except:
    # 			print (s)
    # 			raise EOFError
    # 	dict1 = new_list_of_set
    return dict1
Example #35
def best_matching_hungarian(all_cors,
                            all_pids_info,
                            all_pids_fff,
                            track_vid_next_fid,
                            weights,
                            weights_fff,
                            num,
                            mag,
                            pool_size=5):
    x1, y1, x2, y2 = [all_cors[:, col] for col in range(4)]
    all_grades_details = []
    all_grades = []

    box1_num = len(all_pids_info)
    box2_num = track_vid_next_fid['num_boxes']
    cost_matrix = np.zeros((box1_num, box2_num))

    qsize = box1_num * track_vid_next_fid['num_boxes']
    pool = ProcessPoolExecutor(max_workers=pool_size)
    futures = []
    for pid1 in range(box1_num):
        box1_pos = all_pids_info[pid1]['box_pos']
        box1_region_ids = find_region_cors_last(box1_pos, all_cors)
        box1_score = all_pids_info[pid1]['box_score']
        box1_pose = all_pids_info[pid1]['box_pose_pos']
        box1_fff = all_pids_fff[pid1]

        for pid2 in range(1, track_vid_next_fid['num_boxes'] + 1):
            future = pool.submit(best_matching_hungarian_kernel, pid1, pid2,
                                 all_cors, track_vid_next_fid, weights,
                                 weights_fff, num, mag, box1_pos,
                                 box1_region_ids, box1_score, box1_pose,
                                 box1_fff)
            futures.append(future)

    pool.shutdown(True)
    for future in futures:
        pid1, pid2, grade = future.result()
        cost_matrix[pid1, pid2 - 1] = grade
    m = Munkres()
    indexes = m.compute((-np.array(cost_matrix)).tolist())

    return indexes, cost_matrix
Example #36
class ProcessPoolEvaluator(SubmitEvaluator):
    
    def __init__(self, processes=None):
        try:
            from concurrent.futures import ProcessPoolExecutor
            self.executor = ProcessPoolExecutor(processes)
            super(ProcessPoolEvaluator, self).__init__(self.executor.submit)
            LOGGER.log(logging.INFO, "Started process pool evaluator")
            
            if processes:
                LOGGER.log(logging.INFO, "Using user-defined number of processes: %d", processes)
        except ImportError:
            # prevent error from showing in Eclipse if concurrent.futures not available
            raise
        
    def close(self):
        LOGGER.log(logging.DEBUG, "Closing process pool evaluator")
        self.executor.shutdown()
        LOGGER.log(logging.INFO, "Closed process pool evaluator")
Example #37
    def test_executor(self):
        m = aioprocessing.AioManager()
        q = m.AioQueue()
        p = ProcessPoolExecutor(max_workers=1)
        val = 4
        def submit():
            yield p.submit(queue_put, q, val)
        next(submit())

        @asyncio.coroutine
        def queue_get():
            out = yield from q.coro_get()
            self.assertEqual(out, val)
            yield from q.coro_put(5)

        self.loop.run_until_complete(queue_get())
        returned = q.get()
        self.assertEqual(returned, 5)
        p.shutdown()
Example #38
def infer_all(db_name):
    db = pymongo.MongoClient('127.0.0.1', 27017, connect=False).get_database(db_name)
    executor = ProcessPoolExecutor(max_workers=10)

    futures = []
    for collection_name in db.collection_names():
        if not is_q_col(collection_name):
            continue
        tid = collection_name[:-2]
        q_collection = db[collection_name]
        a_collection = db[q_to_a(collection_name)]
        for q_doc in q_collection.find({}, {'qid':1, 'topic':1}):
            qid = q_doc['qid']
            aids = [a_doc['aid'] for a_doc in
                    a_collection.find({'qid': qid}, {'aid': 1})]
            futures.append(
                executor.submit(infer_question_task, db_name, tid, qid, aids)
            )

    executor.shutdown()
Example #39
    def execute_parallel(self, executor=None, loop=None):

        if executor is None:
            executor = ProcessPoolExecutor()
            shut_executor = True
        else:
            shut_executor = False

        if loop is None:
            loop = asyncio.get_event_loop()

        deps = self.graph.dependency_resolver()
        next_specs = deps.send(None)


        task = loop.create_task(self.submit_next_specs(loop, executor,
                                                    next_specs, deps))
        loop.run_until_complete(task)

        if shut_executor:
            executor.shutdown()
Example #40
def main(argv=None):
    usage = """REDCap Data Model Generator

    Usage:
        redcap dball <version> [--dir=DIR] [--db=DB] [--host=HOST] [--port=PORT] [--user=USER] [--pass=PASS]

    Options:
        -h --help       Show this screen.
        --dir=DIR       Name of the directory to output the files [default: .].
        --db=DB         Name of the REDCap database [default: redcap].
        --host=HOST     Host of the database server [default: localhost].
        --port=PORT     Port of the database server [default: 3306].
        --user=USER     Username to connect with.
        --pass=PASS     Password to connect with. If set to *, a prompt will be provided.
        --procs=PROCS   Number of processes to spawn [default: 24].

    """  # noqa

    from docopt import docopt

    args = docopt(usage, argv=argv, version='0.1')

    if args['--pass'] == '*':
        args['--pass'] = getpass('password: ')

    # NOTE: the original connection call was mangled by the source scraper's
    # password filter; db_connect is a hypothetical placeholder for that helper.
    conn = db_connect(args['--db'],
                      args['--host'],
                      args['--port'],
                      args['--user'],
                      args['--pass'])

    project_names = db_projects(conn)

    pool = ProcessPoolExecutor(max_workers=int(args['--procs']))

    for name in project_names:
        pool.submit(worker, name, args)

    pool.shutdown()
Example #41
def infer_many(db_name, filename):
    """
    Infer answers for a set of questions. Reads a file where each line is of the form
    topic,qid,... (whatever follows does not matter)
    """
    db = pymongo.MongoClient('127.0.0.1', 27017, connect=False).get_database(db_name)
    executor = ProcessPoolExecutor(max_workers=5)

    count = 0
    futures = []
    with open(filename) as f:
        for line in f:
            tid, qid, _ = line.split(',', maxsplit=2)
            a_collection = db[a_col(tid)]
            aids = [a_doc['aid'] for a_doc in
                    a_collection.find({'qid': qid}, {'aid': 1})]
            futures.append(
                executor.submit(infer_question_task, db_name, tid, qid, aids)
            )
            count += len(aids)

    print(count)
    executor.shutdown()
Example #42
def Main():
  global gSymFileManager, gOptions, gPool

  if not ReadConfigFile():
    return 1

  # In a perfect world, we could create a process per cpu core.
  # But then we'd have to deal with cache sharing
  gPool = Pool(1)
  gPool.submit(initializeSubprocess, gOptions)

  # Setup logging in the parent process.
  # Ensure this is called after the call to initializeSubprocess to
  # avoid duplicate messages in Unix systems.
  SetLoggingOptions(gOptions["Log"])

  LogMessage("Starting server with the following options:\n" + str(gOptions))

  app = Application([
    url(r'/(debug)', DebugHandler),
    url(r'/(nodebug)', DebugHandler),
    url(r"(.*)", SymbolHandler)])

  app.listen(gOptions['portNumber'], gOptions['hostname'])

  try:
    # select on Windows doesn't return on ctrl-c, add a periodic
    # callback to make ctrl-c responsive
    if sys.platform == 'win32':
        PeriodicCallback(lambda: None, 100).start()
    IOLoop.current().start()
  except KeyboardInterrupt:
    LogMessage("Received SIGINT, stopping...")

  gPool.shutdown()
  LogMessage("Server stopped - " + gOptions['hostname'] + ":" + str(gOptions['portNumber']))
  return 0
Example #43
def run_in_process(sync_fn, *args):
    pool = ProcessPoolExecutor(max_workers=1)
    result = yield pool.submit(sync_fn, *args)
    pool.shutdown()
    return result
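run_in_process above is a Tornado-style generator coroutine: the yield on the concurrent.futures future only resolves once the function is wrapped with tornado.gen.coroutine (presumably applied in the original source). A minimal driver sketch under that assumption:

from tornado import gen
from tornado.ioloop import IOLoop

def cpu_bound(n):
    # must be picklable, i.e. defined at module level
    return sum(i * i for i in range(n))

if __name__ == "__main__":
    wrapped = gen.coroutine(run_in_process)   # assumes the definition above
    result = IOLoop.current().run_sync(lambda: wrapped(cpu_bound, 100_000))
    print(result)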
Example #44
class BokehTornado(TornadoApplication):
    ''' A Tornado Application used to implement the Bokeh Server.

        The Server class is the main public interface; this class holds
        the Tornado implementation details.

    Args:
        applications (dict of str : bokeh.application.Application) : map from paths to Application instances
            The application is used to create documents for each session.
        extra_patterns (seq[tuple]) : tuples of (str, http or websocket handler)
            Use this argument to add additional endpoints to custom deployments
            of the Bokeh Server.
        prefix (str) : a URL prefix to use for all Bokeh server paths
        secret_key (str) : secret key for signing session IDs
        sign_sessions (boolean) : whether to sign session IDs
        generate_session_ids (boolean) : whether to generate a session ID when none is provided
        extra_websocket_origins (list) : hosts that can connect to the websocket
        keep_alive_milliseconds (int) : number of milliseconds between keep-alive pings
            Set to 0 to disable pings. Pings keep the websocket open.
        check_unused_sessions_milliseconds (int) : number of milliseconds between check for unused sessions
        unused_session_lifetime_milliseconds (int) : number of milliseconds for unused session lifetime
        stats_log_frequency_milliseconds (int) : number of milliseconds between logging stats
        use_index (boolean) : True to generate an index of the running apps in the RootHandler

    '''

    def __init__(self, applications,
                 prefix,
                 extra_websocket_origins,
                 extra_patterns=None,
                 secret_key=settings.secret_key_bytes(),
                 sign_sessions=settings.sign_sessions(),
                 generate_session_ids=True,
                 # heroku and nginx default to a 60s timeout, so stay well under that
                 keep_alive_milliseconds=37000,
                 # how often to check for unused sessions
                 check_unused_sessions_milliseconds=17000,
                 # how long unused sessions last
                 unused_session_lifetime_milliseconds=15000,
                 # how often to log stats
                 stats_log_frequency_milliseconds=15000,
                 use_index=True,
                 redirect_root=True):

        self._prefix = prefix
        self.use_index = use_index

        if keep_alive_milliseconds < 0:
            # 0 means "disable"
            raise ValueError("keep_alive_milliseconds must be >= 0")

        if check_unused_sessions_milliseconds <= 0:
            raise ValueError("check_unused_sessions_milliseconds must be > 0")

        if unused_session_lifetime_milliseconds <= 0:
            raise ValueError("check_unused_sessions_milliseconds must be > 0")

        if stats_log_frequency_milliseconds <= 0:
            raise ValueError("stats_log_frequency_milliseconds must be > 0")

        self._websocket_origins = set(extra_websocket_origins)
        self._secret_key = secret_key
        self._sign_sessions = sign_sessions
        self._generate_session_ids = generate_session_ids

        log.debug("These host origins can connect to the websocket: %r", list(self._websocket_origins))

        # Wrap applications in ApplicationContext
        self._applications = dict()
        for k,v in applications.items():
            self._applications[k] = ApplicationContext(v)

        extra_patterns = extra_patterns or []
        all_patterns = []
        for key, app in applications.items():
            app_patterns = []
            for p in per_app_patterns:
                if key == "/":
                    route = p[0]
                else:
                    route = key + p[0]
                route = self._prefix + route
                app_patterns.append((route, p[1], { "application_context" : self._applications[key] }))

            websocket_path = None
            for r in app_patterns:
                if r[0].endswith("/ws"):
                    websocket_path = r[0]
            if not websocket_path:
                raise RuntimeError("Couldn't find websocket path")
            for r in app_patterns:
                r[2]["bokeh_websocket_path"] = websocket_path

            all_patterns.extend(app_patterns)

            # add a per-app static path if requested by the application
            if app.static_path is not None:
                if key == "/":
                    route = "/static/(.*)"
                else:
                    route = key + "/static/(.*)"
                route = self._prefix + route
                all_patterns.append((route, StaticFileHandler, { "path" : app.static_path }))

        for p in extra_patterns + toplevel_patterns:
            if p[1] == RootHandler:
                if self.use_index:
                    data = {"applications": self._applications,
                            "prefix": self._prefix,
                            "use_redirect": redirect_root}
                    prefixed_pat = (self._prefix + p[0],) + p[1:] + (data,)
                    all_patterns.append(prefixed_pat)
            else:
                prefixed_pat = (self._prefix + p[0],) + p[1:]
                all_patterns.append(prefixed_pat)

        log.debug("Patterns are:")
        for line in pformat(all_patterns, width=60).split("\n"):
            log.debug("  " + line)

        super(BokehTornado, self).__init__(all_patterns)

    def initialize(self,
                 io_loop,
                 keep_alive_milliseconds=37000,
                 # how often to check for unused sessions
                 check_unused_sessions_milliseconds=17000,
                 # how long unused sessions last
                 unused_session_lifetime_milliseconds=15000,
                 # how often to log stats
                 stats_log_frequency_milliseconds=15000,
                 **kw):

        self._loop = io_loop

        for app_context in self._applications.values():
            app_context._loop = self._loop

        self._clients = set()
        self._executor = ProcessPoolExecutor(max_workers=4)
        self._stats_job = PeriodicCallback(self.log_stats,
                                           stats_log_frequency_milliseconds,
                                           io_loop=self._loop)
        self._unused_session_linger_milliseconds = unused_session_lifetime_milliseconds
        self._cleanup_job = PeriodicCallback(self.cleanup_sessions,
                                             check_unused_sessions_milliseconds,
                                             io_loop=self._loop)

        if keep_alive_milliseconds > 0:
            self._ping_job = PeriodicCallback(self.keep_alive, keep_alive_milliseconds, io_loop=self._loop)
        else:
            self._ping_job = None

    @property
    def app_paths(self):
        return set(self._applications)

    @property
    def io_loop(self):
        return self._loop

    @property
    def websocket_origins(self):
        return self._websocket_origins

    @property
    def secret_key(self):
        return self._secret_key

    @property
    def sign_sessions(self):
        return self._sign_sessions

    @property
    def generate_session_ids(self):
        return self._generate_session_ids

    def resources(self, absolute_url=None):
        if absolute_url:
            return Resources(mode="server", root_url=absolute_url + self._prefix, path_versioner=StaticHandler.append_version)
        return Resources(mode="server", root_url=self._prefix, path_versioner=StaticHandler.append_version)

    def start(self):
        ''' Start the Bokeh Server application.
        '''
        self._stats_job.start()
        self._cleanup_job.start()
        if self._ping_job is not None:
            self._ping_job.start()

        for context in self._applications.values():
            context.run_load_hook()

    def stop(self, wait=True):
        ''' Stop the Bokeh Server application.

        Args:
            wait (boolean): whether to wait for orderly cleanup (default: True)

        Returns:
            None

        '''
        # TODO we should probably close all connections and shut
        # down all sessions here
        for context in self._applications.values():
            context.run_unload_hook()

        self._stats_job.stop()
        self._cleanup_job.stop()
        if self._ping_job is not None:
            self._ping_job.stop()

        self._executor.shutdown(wait=wait)
        self._clients.clear()

    @property
    def executor(self):
        return self._executor

    def new_connection(self, protocol, socket, application_context, session):
        connection = ServerConnection(protocol, socket, application_context, session)
        self._clients.add(connection)
        return connection

    def client_lost(self, connection):
        self._clients.discard(connection)
        connection.detach_session()

    def get_session(self, app_path, session_id):
        if app_path not in self._applications:
            raise ValueError("Application %s does not exist on this server" % app_path)
        return self._applications[app_path].get_session(session_id)

    def get_sessions(self, app_path):
        if app_path not in self._applications:
            raise ValueError("Application %s does not exist on this server" % app_path)
        return list(self._applications[app_path].sessions)

    @gen.coroutine
    def cleanup_sessions(self):
        for app in self._applications.values():
            yield app.cleanup_sessions(self._unused_session_linger_milliseconds)
        raise gen.Return(None)

    def log_stats(self):
        if log.getEffectiveLevel() > logging.DEBUG:
            # avoid the work below if we aren't going to log anything
            return
        log.debug("[pid %d] %d clients connected", os.getpid(), len(self._clients))
        for app_path, app in self._applications.items():
            sessions = list(app.sessions)
            unused_count = 0
            for s in sessions:
                if s.connection_count == 0:
                    unused_count += 1
            log.debug("[pid %d]   %s has %d sessions with %d unused",
                      os.getpid(), app_path, len(sessions), unused_count)

    def keep_alive(self):
        for c in self._clients:
            c.send_ping()

    @gen.coroutine
    def run_in_background(self, _func, *args, **kwargs):
        """
        Run a synchronous function in the background without disrupting
        the main thread. Useful for long-running jobs.
        """
        res = yield self._executor.submit(_func, *args, **kwargs)
        raise gen.Return(res)
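
A minimal, hedged sketch of how the executor above can be driven from a Tornado coroutine (blocking_job is a hypothetical stand-in for real work; concurrent.futures futures are yieldable inside gen.coroutine):

from concurrent.futures import ProcessPoolExecutor
from tornado import gen
from tornado.ioloop import IOLoop

def blocking_job(n):
    # stand-in for CPU-bound work; runs in a worker process
    return sum(i * i for i in range(n))

@gen.coroutine
def handler(executor):
    # yielding the worker's Future keeps the IOLoop free while it runs
    result = yield executor.submit(blocking_job, 1000000)
    raise gen.Return(result)

if __name__ == '__main__':
    pool = ProcessPoolExecutor(max_workers=2)
    print(IOLoop.current().run_sync(lambda: handler(pool)))
    pool.shutdown()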
Example #45
0
class DataRouter(object):
    DEFAULT_PROJECT_NAME = "default"

    def __init__(self, config, component_builder):
        self._training_processes = config['max_training_processes'] if config['max_training_processes'] > 0 else 1
        self.config = config
        self.responses = self._create_query_logger(config)
        self.model_dir = config['path']
        self.token = config['token']
        self.emulator = self._create_emulator()
        self.component_builder = component_builder if component_builder else ComponentBuilder(use_cache=True)
        self.project_store = self._create_project_store()
        self.pool = ProcessPool(self._training_processes)

    def __del__(self):
        """Terminates workers pool processes"""
        self.pool.shutdown()

    def _create_query_logger(self, config):
        """Creates a logger that will persist incoming queries and their results."""

        response_log_dir = config['response_log']
        # Ensures different log files for different processes in multi worker mode
        if response_log_dir:
            # We need to generate a unique file name, even in multiprocess environments
            timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            log_file_name = "rasa_nlu_log-{}-{}.log".format(timestamp, os.getpid())
            response_logfile = os.path.join(response_log_dir, log_file_name)
            # Instantiate a standard python logger, which we are going to use to log requests
            utils.create_dir_for_file(response_logfile)
            query_logger = Logger(observer=jsonFileLogObserver(io.open(response_logfile, 'a', encoding='utf8')),
                                  namespace='query-logger')
            # Prevents queries getting logged with parent logger --> might log them to stdout
            logger.info("Logging requests to '{}'.".format(response_logfile))
            return query_logger
        else:
            # If the user didn't provide a logging directory, we won't log!
            logger.info("Logging of requests is disabled. (No 'request_log' directory configured)")
            return None

    def _create_project_store(self):
        projects = []

        if os.path.isdir(self.config['path']):
            projects = os.listdir(self.config['path'])

        project_store = {}

        for project in projects:
            project_store[project] = Project(self.config, self.component_builder, project)

        if not project_store:
            project_store[self.DEFAULT_PROJECT_NAME] = Project()
        return project_store

    def _create_emulator(self):
        """Sets which NLU webservice to emulate among those supported by Rasa"""

        mode = self.config['emulate']
        if mode is None:
            from rasa_nlu.emulators import NoEmulator
            return NoEmulator()
        elif mode.lower() == 'wit':
            from rasa_nlu.emulators.wit import WitEmulator
            return WitEmulator()
        elif mode.lower() == 'luis':
            from rasa_nlu.emulators.luis import LUISEmulator
            return LUISEmulator()
        elif mode.lower() == 'api':
            from rasa_nlu.emulators.api import ApiEmulator
            return ApiEmulator()
        else:
            raise ValueError("unknown mode : {0}".format(mode))

    def extract(self, data):
        return self.emulator.normalise_request_json(data)

    def parse(self, data):
        project = data.get("project") or self.DEFAULT_PROJECT_NAME
        model = data.get("model")

        if project not in self.project_store:
            projects = os.listdir(self.config['path'])
            if project not in projects:
                raise InvalidProjectError("No project found with name '{}'.".format(project))
            else:
                try:
                    self.project_store[project] = Project(self.config, self.component_builder, project)
                except Exception as e:
                    raise InvalidProjectError("Unable to load project '{}'. Error: {}".format(project, e))

        response, used_model = self.project_store[project].parse(data['text'], data.get('time', None), model)

        if self.responses:
            self.responses.info(user_input=response, project=project, model=used_model)
        return self.format_response(response)

    def format_response(self, data):
        return self.emulator.normalise_response_json(data)

    def get_status(self):
        # This will only count the trainings started from this process; if run
        # in multi-worker mode, there might be other trainings running in
        # different processes that we don't know about.

        return {
            "available_projects": {name: project.as_dict() for name, project in self.project_store.items()}
        }

    def start_train_process(self, data, config_values):
        # type: (Text, Dict[Text, Any]) -> Deferred
        """Start a model training."""

        if PY3:
            f = tempfile.NamedTemporaryFile("w+", suffix="_training_data", delete=False, encoding="utf-8")
            f.write(data)
        else:
            f = tempfile.NamedTemporaryFile("w+", suffix="_training_data", delete=False)
            f.write(data.encode("utf-8"))
        f.close()
        # TODO: fix config handling
        _config = self.config.as_dict()
        for key, val in config_values.items():
            _config[key] = val
        _config["data"] = f.name
        train_config = RasaNLUConfig(cmdline_args=_config)

        project = _config.get("project")
        if not project:
            raise InvalidProjectError("Missing project name to train")
        elif project in self.project_store:
            if self.project_store[project].status == 1:
                raise AlreadyTrainingError
            else:
                self.project_store[project].status = 1
        elif project not in self.project_store:
            self.project_store[project] = Project(self.config, self.component_builder, project)
            self.project_store[project].status = 1

        def training_callback(model_path):
            model_dir = os.path.basename(os.path.normpath(model_path))
            self.project_store[project].update(model_dir)
            return model_dir

        def training_errback(failure):
            target_project = self.project_store.get(failure.value.failed_target_project)
            if target_project:
                target_project.status = 0
            return failure

        logger.debug("New training queued")

        result = self.pool.submit(do_train_in_worker, train_config)
        result = deferred_from_future(result)
        result.addCallback(training_callback)
        result.addErrback(training_errback)

        return result
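
The Deferred returned by start_train_process comes from wrapping the pool's Future; a hedged, minimal sketch of what such a deferred_from_future bridge could look like (hypothetical helper, not necessarily the project's actual implementation):

from twisted.internet import reactor
from twisted.internet.defer import Deferred

def deferred_from_future_sketch(future):
    # Resolve a Twisted Deferred when a concurrent.futures.Future finishes,
    # hopping back onto the reactor thread before firing callbacks.
    d = Deferred()

    def _done(f):
        error = f.exception()
        if error is not None:
            reactor.callFromThread(d.errback, error)
        else:
            reactor.callFromThread(d.callback, f.result())

    future.add_done_callback(_done)
    return d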
Example #46
0
def main(vcf, covariates, formula, min_qual, min_genotype_qual, min_samples,
        weighted=False, as_vcf=False, exclude_nan=False, groups=None):
    #if weighted == "FALSE": weighted = False
    #else:
    #    weight_fn = {'log10': np.log10, 'log': np.log, 'GQ': np.array}[weighted]
    if covariates.endswith('.csv'):
        covariate_df = pd.read_csv(covariates, index_col=0)
    else:
        covariate_df = pd.read_table(covariates, index_col=0)
    covariate_df.index = [str(x) for x in covariate_df.index]
    gmatrix = {}

    if groups == 'covariance':
        assert op.isfile(vcf), ('need to iterate over vcf 2x')
        cov = get_covariance(_get_genotypes(vcf, min_qual,
                                min_genotype_qual, min_samples, as_vcf))
        groups = pd.DataFrame(cov, index=covariate_df.index,
                columns=covariate_df.index)
        print(groups)
        # NOTE: currently using GLS and a covariance matrix but we assume
        # a binary dependent variable so estimates are off.

    po = ProcessPoolExecutor(1)

    for i, (samples, genos, quals, variant) in enumerate(
            _get_genotypes(vcf, min_qual, min_genotype_qual, min_samples,
                           as_vcf)):
        if i == 0 and not samples is None:
            # make sure we have covariates for all samples in the vcf
            assert not set(samples).difference(covariate_df.index),\
                        set(samples).difference(covariate_df.index)
            covariate_df = covariate_df.ix[samples,:]
        covariate_df['genotype'] = genos

        if samples is None:
            if exclude_nan: continue
            res = {'OR': np.nan, 'pvalue': np.nan, 'z': np.nan, 'OR_CI':
                    (np.nan, np.nan), 'xtab': 'NA'}
        else:
            xtab_future = po.submit(xtab, formula, covariate_df)
            try:
                res = vcfassoc(formula, covariate_df, groups)
                gmatrix['{CHROM}:{POS}'.format(**variant)] = genos
            except np.linalg.linalg.LinAlgError:
                res = {'OR': np.nan, 'pvalue': np.nan, 'z': np.nan, 'OR_CI':
                        (np.nan, np.nan)}
            except statsmodels.tools.sm_exceptions.PerfectSeparationError:
                print("WARNING: perfect separation, too few samples(?)",
                      ": setting to -9: {CHROM}:{POS}".format(**variant),
                      file=sys.stderr)
                res = {}
                res['z'] = res['OR'] = np.nan
                res['pvalue'] = -9.0 # blech.
                res['OR_CI'] = np.nan, np.nan
                gmatrix['{CHROM}:{POS}'.format(**variant)] = genos
            except IndexError:
                continue
            res['xtab'] = xtab_future.result()
            #res['xtab'] = xtab(formula, covariate_df)
        print_result(res, variant, as_vcf, i)

    l1_regr(pd.DataFrame(gmatrix), covariate_df, formula)
    po.shutdown()
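
The loop above overlaps the cross-tabulation with the regression by submitting it first and calling result() only when needed; a small self-contained sketch of that overlap pattern (the two functions are illustrative stand-ins):

from concurrent.futures import ProcessPoolExecutor

def slow_summary(values):
    # stand-in for the submitted cross-tabulation step
    return {'n': len(values), 'total': sum(values)}

def slow_fit(values):
    # stand-in for the model fit that runs locally in the meantime
    return sum(values) / float(len(values))

if __name__ == '__main__':
    data = list(range(100))
    with ProcessPoolExecutor(max_workers=1) as pool:
        summary_future = pool.submit(slow_summary, data)  # runs in the worker
        fit = slow_fit(data)                              # runs here, concurrently
        summary = summary_future.result()                 # block only when needed
    print(fit, summary)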
Example #47
0
class BokehTornado(TornadoApplication):
    ''' A Tornado Application used to implement the Bokeh Server.

        The Server class is the main public interface, this class has
        Tornado implementation details.

    Args:
        applications (dict of str : bokeh.application.Application) : map from paths to Application instances
            The application is used to create documents for each session.
        extra_patterns (seq[tuple]) : tuples of (str, http or websocket handler)
            Use this argument to add additional endpoints to custom deployments
            of the Bokeh Server.
        prefix (str) : a URL prefix to use for all Bokeh server paths
        hosts (list) : hosts that are valid values for the Host header
        secret_key (str) : secret key for signing session IDs
        sign_sessions (boolean) : whether to sign session IDs
        generate_session_ids (boolean) : whether to generate a session ID when none is provided
        extra_websocket_origins (list) : hosts that can connect to the websocket
            These are in addition to ``hosts``.
        keep_alive_milliseconds (int) : number of milliseconds between keep-alive pings
            Set to 0 to disable pings. Pings keep the websocket open.
        develop (boolean) : True for develop mode

    '''

    def __init__(self, applications, prefix, hosts,
                 extra_websocket_origins,
                 io_loop=None,
                 extra_patterns=None,
                 secret_key=settings.secret_key_bytes(),
                 sign_sessions=settings.sign_sessions(),
                 generate_session_ids=True,
                 # heroku and nginx default to a 60s timeout, so use something well under that
                 keep_alive_milliseconds=37000,
                 # how often to check for unused sessions
                 check_unused_sessions_milliseconds=17000,
                 # how long unused sessions last
                 unused_session_lifetime_milliseconds=60*30*1000,
                 # how often to log stats
                 stats_log_frequency_milliseconds=15000,
                 develop=False):

        self._prefix = prefix

        if io_loop is None:
            io_loop = IOLoop.current()
        self._loop = io_loop

        if keep_alive_milliseconds < 0:
            # 0 means "disable"
            raise ValueError("keep_alive_milliseconds must be >= 0")

        self._hosts = set(hosts)
        self._websocket_origins = self._hosts | set(extra_websocket_origins)
        self._resources = {}
        self._develop = develop
        self._secret_key = secret_key
        self._sign_sessions = sign_sessions
        self._generate_session_ids = generate_session_ids

        log.debug("Allowed Host headers: %r", list(self._hosts))
        log.debug("These host origins can connect to the websocket: %r", list(self._websocket_origins))

        # Wrap applications in ApplicationContext
        self._applications = dict()
        for k,v in applications.items():
            self._applications[k] = ApplicationContext(v, self._develop, self._loop)

        extra_patterns = extra_patterns or []
        all_patterns = []
        for key in applications:
            app_patterns = []
            for p in per_app_patterns:
                if key == "/":
                    route = p[0]
                else:
                    route = key + p[0]
                route = self._prefix + route
                app_patterns.append((route, p[1], { "application_context" : self._applications[key] }))

            websocket_path = None
            for r in app_patterns:
                if r[0].endswith("/ws"):
                    websocket_path = r[0]
            if not websocket_path:
                raise RuntimeError("Couldn't find websocket path")
            for r in app_patterns:
                r[2]["bokeh_websocket_path"] = websocket_path

            all_patterns.extend(app_patterns)

        for p in extra_patterns + toplevel_patterns:
            prefixed_pat = (self._prefix+p[0],) + p[1:]
            all_patterns.append(prefixed_pat)

        for pat in all_patterns:
            _whitelist(pat[1])

        log.debug("Patterns are: %r", all_patterns)

        super(BokehTornado, self).__init__(all_patterns)

        self._clients = set()
        self._executor = ProcessPoolExecutor(max_workers=4)
        self._loop.add_callback(self._start_async)
        self._stats_job = PeriodicCallback(self.log_stats,
                                           stats_log_frequency_milliseconds,
                                           io_loop=self._loop)
        self._unused_session_linger_milliseconds = unused_session_lifetime_milliseconds
        self._cleanup_job = PeriodicCallback(self.cleanup_sessions,
                                             check_unused_sessions_milliseconds,
                                             io_loop=self._loop)

        if keep_alive_milliseconds > 0:
            self._ping_job = PeriodicCallback(self.keep_alive, keep_alive_milliseconds, io_loop=self._loop)
        else:
            self._ping_job = None

    @property
    def io_loop(self):
        return self._loop

    @property
    def websocket_origins(self):
        return self._websocket_origins

    @property
    def secret_key(self):
        return self._secret_key

    @property
    def sign_sessions(self):
        return self._sign_sessions

    @property
    def generate_session_ids(self):
        return self._generate_session_ids

    def root_url_for_request(self, request):
        return request.protocol + "://" + request.host + self._prefix + "/"

    def websocket_url_for_request(self, request, websocket_path):
        # websocket_path comes from the handler, and already has any
        # prefix included, no need to add here
        protocol = "ws"
        if request.protocol == "https":
            protocol = "wss"
        return protocol + "://" + request.host + websocket_path

    def resources(self, request):
        root_url = self.root_url_for_request(request)
        if root_url not in self._resources:
            self._resources[root_url] =  Resources(mode="server",
                                                   root_url=root_url,
                                                   path_versioner=StaticHandler.append_version)
        return self._resources[root_url]

    def start(self, start_loop=True):
        ''' Start the Bokeh Server application main loop.

        Args:
            start_loop (boolean): False to not actually start event loop, used in tests

        Returns:
            None

        Notes:
            Keyboard interrupts or sigterm will cause the server to shut down.

        '''
        self._stats_job.start()
        self._cleanup_job.start()
        if self._ping_job is not None:
            self._ping_job.start()

        for context in self._applications.values():
            context.run_load_hook()

        if start_loop:
            try:
                self._loop.start()
            except KeyboardInterrupt:
                print("\nInterrupted, shutting down")

    def stop(self):
        ''' Stop the Bokeh Server application.

        Returns:
            None

        '''
        # TODO we should probably close all connections and shut
        # down all sessions either here or in unlisten() ... but
        # it isn't that important since in real life it's rare to
        # do a clean shutdown (vs. a kill-by-signal) anyhow.

        for context in self._applications.values():
            context.run_unload_hook()

        self._stats_job.stop()
        self._cleanup_job.stop()
        if self._ping_job is not None:
            self._ping_job.stop()

        self._loop.stop()

    @property
    def executor(self):
        return self._executor

    def new_connection(self, protocol, socket, application_context, session):
        connection = ServerConnection(protocol, socket, application_context, session)
        self._clients.add(connection)
        return connection

    def client_lost(self, connection):
        self._clients.discard(connection)
        connection.detach_session()

    def get_session(self, app_path, session_id):
        if app_path not in self._applications:
            raise ValueError("Application %s does not exist on this server" % app_path)
        return self._applications[app_path].get_session(session_id)

    def get_sessions(self, app_path):
        if app_path not in self._applications:
            raise ValueError("Application %s does not exist on this server" % app_path)
        return list(self._applications[app_path].sessions)

    @gen.coroutine
    def cleanup_sessions(self):
        for app in self._applications.values():
            yield app.cleanup_sessions(self._unused_session_linger_milliseconds)
        raise gen.Return(None)

    def log_stats(self):
        if log.getEffectiveLevel() > logging.DEBUG:
            # avoid the work below if we aren't going to log anything
            return
        log.debug("[pid %d] %d clients connected", os.getpid(), len(self._clients))
        for app_path, app in self._applications.items():
            sessions = list(app.sessions)
            unused_count = 0
            for s in sessions:
                if s.connection_count == 0:
                    unused_count += 1
            log.debug("[pid %d]   %s has %d sessions with %d unused",
                      os.getpid(), app_path, len(sessions), unused_count)

    def keep_alive(self):
        for c in self._clients:
            c.send_ping()

    @gen.coroutine
    def run_in_background(self, _func, *args, **kwargs):
        """
        Run a synchronous function in the background without disrupting
        the main thread. Useful for long-running jobs.
        """
        res = yield self._executor.submit(_func, *args, **kwargs)
        raise gen.Return(res)

    @gen.coroutine
    def _start_async(self):
        try:
            atexit.register(self._atexit)
            signal.signal(signal.SIGTERM, self._sigterm)
        except Exception:
            self.exit(1)

    _atexit_ran = False
    def _atexit(self):
        if self._atexit_ran:
            return
        self._atexit_ran = True

        self._stats_job.stop()
        IOLoop.clear_current()
        loop = IOLoop()
        loop.make_current()
        loop.run_sync(self._cleanup)

    def _sigterm(self, signum, frame):
        print("Received SIGTERM, shutting down")
        self.stop()
        self._atexit()

    @gen.coroutine
    def _cleanup(self):
        log.debug("Shutdown: cleaning up")
        self._executor.shutdown(wait=False)
        self._clients.clear()
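
A hedged sketch of the shutdown pattern used by _start_async/_atexit/_sigterm above: register an idempotent cleanup with atexit and a SIGTERM handler so the process pool is shut down on both normal exit and termination (names here are illustrative):

import atexit
import signal
import sys
from concurrent.futures import ProcessPoolExecutor

_pool = None

def _cleanup():
    # idempotent; safe to call from both atexit and the signal handler
    global _pool
    if _pool is not None:
        _pool.shutdown(wait=False)
        _pool = None

def _on_sigterm(signum, frame):
    _cleanup()
    sys.exit(0)

if __name__ == '__main__':
    _pool = ProcessPoolExecutor(max_workers=2)
    atexit.register(_cleanup)
    signal.signal(signal.SIGTERM, _on_sigterm)
    print(_pool.submit(pow, 2, 10).result())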
Example #48
0
class RandomOptimizer(paths.PathOptimizer):
    """Base class for running any random path finder that benefits
    from repeated calling, possibly in a parallel fashion. Custom random
    optimizers should subclass this, and the ``setup`` method should be
    implemented with the following signature::

        def setup(self, inputs, output, size_dict):
            # custom preparation here ...
            return trial_fn, trial_args

    Where ``trial_fn`` itself should have the signature::

        def trial_fn(r, *trial_args):
            # custom computation of path here
            return ssa_path, cost, size

    Where ``r`` is the run number and could for example be used to seed a
    random number generator. See ``RandomGreedy`` for an example.


    Parameters
    ----------
    max_repeats : int, optional
        The maximum number of repeat trials to have.
    max_time : float, optional
        The maximum amount of time to run the algorithm for.
    minimize : {'flops', 'size'}, optional
        Whether to favour paths that minimize the total estimated flop-count or
        the size of the largest intermediate created.
    parallel : {bool, int, or executor-pool like}, optional
        Whether to parallelize the random trials, by default ``False``. If
        ``True``, use a ``concurrent.futures.ProcessPoolExecutor`` with the same
        number of processes as cores. If an integer is specified, use that many
        processes instead. Finally, you can supply a custom executor-pool which
        should have an API matching that of the python 3 standard library
        module ``concurrent.futures``. Namely, a ``submit`` method that returns
        ``Future`` objects, themselves with ``result`` and ``cancel`` methods.
    pre_dispatch : int, optional
        If running in parallel, how many jobs to pre-dispatch so as to avoid
        submitting all jobs at once. Should also be more than twice the number
        of workers to avoid under-subscription. Default: 128.

    Attributes
    ----------
    path : list[tuple[int]]
        The best path found so far.
    costs : list[int]
        The list of each trial's costs found so far.
    sizes : list[int]
        The list of each trial's largest intermediate size so far.

    See Also
    --------
    RandomGreedy
    """

    def __init__(self, max_repeats=32, max_time=None, minimize='flops', parallel=False, pre_dispatch=128):

        if minimize not in ('flops', 'size'):
            raise ValueError("`minimize` should be one of {'flops', 'size'}.")

        self.max_repeats = max_repeats
        self.max_time = max_time
        self.minimize = minimize
        self.better = paths.get_better_fn(minimize)
        self.parallel = parallel
        self.pre_dispatch = pre_dispatch

        self.costs = []
        self.sizes = []
        self.best = {'flops': float('inf'), 'size': float('inf')}

        self._repeats_start = 0

    @property
    def path(self):
        """The best path found so far.
        """
        return paths.ssa_to_linear(self.best['ssa_path'])

    @property
    def parallel(self):
        return self._parallel

    @parallel.setter
    def parallel(self, parallel):
        # shutdown any previous executor if we are managing it
        if getattr(self, '_managing_executor', False):
            self._executor.shutdown()

        self._parallel = parallel
        self._managing_executor = False

        if parallel is False:
            self._executor = None
            return

        if parallel is True:
            from concurrent.futures import ProcessPoolExecutor
            self._executor = ProcessPoolExecutor()
            self._managing_executor = True
            return

        if isinstance(parallel, numbers.Number):
            from concurrent.futures import ProcessPoolExecutor
            self._executor = ProcessPoolExecutor(parallel)
            self._managing_executor = True
            return

        # assume a pool-executor has been supplied
        self._executor = parallel

    def _gen_results_parallel(self, repeats, trial_fn, args):
        """Lazily generate results from an executor without submitting all jobs at once.
        """
        self._futures = deque()

        # the idea here is to submit at least ``pre_dispatch`` jobs *before* we
        # yield any results, then do both in tandem, before draining the queue
        for r in repeats:
            self._futures.append(self._executor.submit(trial_fn, r, *args))
            if len(self._futures) >= self.pre_dispatch:
                yield self._futures.popleft().result()

        while self._futures:
            yield self._futures.popleft().result()

    def _cancel_futures(self):
        if self._executor is not None:
            for f in self._futures:
                f.cancel()

    def setup(self, inputs, output, size_dict):
        raise NotImplementedError

    def __call__(self, inputs, output, size_dict, memory_limit):
        # start a timer?
        if self.max_time is not None:
            t0 = time.time()

        trial_fn, trial_args = self.setup(inputs, output, size_dict)

        r_start = self._repeats_start + len(self.costs)
        r_stop = r_start + self.max_repeats
        repeats = range(r_start, r_stop)

        # create the trials lazily
        if self._executor is not None:
            trials = self._gen_results_parallel(repeats, trial_fn, trial_args)
        else:
            trials = (trial_fn(r, *trial_args) for r in repeats)

        # assess the trials
        for ssa_path, cost, size in trials:

            # keep track of all costs and sizes
            self.costs.append(cost)
            self.sizes.append(size)

            # check if we have found a new best
            found_new_best = self.better(cost, size, self.best['flops'], self.best['size'])

            if found_new_best:
                self.best['flops'] = cost
                self.best['size'] = size
                self.best['ssa_path'] = ssa_path

            # check if we have run out of time
            if (self.max_time is not None) and (time.time() > t0 + self.max_time):
                break

        self._cancel_futures()
        return self.path

    def __del__(self):
        # if we created the parallel pool-executor, shut it down
        if getattr(self, '_managing_executor', False):
            self._executor.shutdown()
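
The pre_dispatch logic in _gen_results_parallel keeps a bounded window of futures in flight; a standalone sketch of that bounded-submission generator, independent of the class above:

from collections import deque
from concurrent.futures import ProcessPoolExecutor

def bounded_map(executor, fn, iterable, pre_dispatch=8):
    # Keep at most `pre_dispatch` futures in flight, yielding results in
    # submission order as new work is queued, then drain what remains.
    queue = deque()
    for item in iterable:
        queue.append(executor.submit(fn, item))
        if len(queue) >= pre_dispatch:
            yield queue.popleft().result()
    while queue:
        yield queue.popleft().result()

if __name__ == '__main__':
    with ProcessPoolExecutor() as pool:
        for value in bounded_map(pool, abs, range(-5, 5), pre_dispatch=4):
            print(value)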
Example #49
0
class MultiProcPlugin(DistributedPluginBase):
    """
    Execute workflow with multiprocessing, not sending more jobs at once
    than the system can support.

    The plugin_args input to run can be used to control the multiprocessing
    execution and defining the maximum amount of memory and threads that
    should be used. When those parameters are not specified,
    the number of threads and memory of the system is used.

    System consuming nodes should be tagged::

      memory_consuming_node.mem_gb = 8
      thread_consuming_node.n_procs = 16

    The default number of threads and amount of memory are set at node
    creation, and are 1 and 0.25 GB, respectively.

    Currently supported options are:

    - non_daemon: boolean flag to execute as non-daemon processes
    - n_procs: maximum number of threads to be executed in parallel
    - memory_gb: maximum memory (in GB) that can be used at once.
    - raise_insufficient: raise an error if the resources requested by
        a node exceed the maximum `n_procs` and/or `memory_gb`
        (default is ``True``).
    - scheduler: sort jobs topologically (``'tsort'``, default value)
        or prioritize jobs by, first, memory consumption and, second,
        number of threads (``'mem_thread'`` option).
    - mp_context: name of multiprocessing context to use

    """

    def __init__(self, plugin_args=None):
        # Init variables and instance attributes
        super(MultiProcPlugin, self).__init__(plugin_args=plugin_args)
        self._taskresult = {}
        self._task_obj = {}
        self._taskid = 0

        # Cache current working directory and make sure we
        # change to it when workers are set up
        self._cwd = os.getcwd()

        # Read in options or set defaults.
        self.processors = self.plugin_args.get('n_procs', mp.cpu_count())
        self.memory_gb = self.plugin_args.get(
            'memory_gb',  # Allocate 90% of system memory
            get_system_total_memory_gb() * 0.9)
        self.raise_insufficient = self.plugin_args.get('raise_insufficient',
                                                       True)

        # Instantiate different thread pools for non-daemon processes
        logger.debug('[MultiProc] Starting (n_procs=%d, '
                     'mem_gb=%0.2f, cwd=%s)',
                     self.processors, self.memory_gb, self._cwd)

        try:
            mp_context = mp.get_context(
                self.plugin_args.get('mp_context'))
            self.pool = ProcessPoolExecutor(max_workers=self.processors,
                                            initializer=os.chdir,
                                            initargs=(self._cwd,),
                                            mp_context=mp_context)
        except (AttributeError, TypeError):
            # Python < 3.7 does not support initialization or contexts
            self.pool = ProcessPoolExecutor(max_workers=self.processors)

        self._stats = None

    def _async_callback(self, args):
        # Make sure runtime is not left at a dubious working directory
        os.chdir(self._cwd)
        result = args.result()
        self._taskresult[result['taskid']] = result

    def _get_result(self, taskid):
        return self._taskresult.get(taskid)

    def _clear_task(self, taskid):
        del self._task_obj[taskid]

    def _submit_job(self, node, updatehash=False):
        self._taskid += 1

        # Don't allow streaming outputs
        if getattr(node.interface, 'terminal_output', '') == 'stream':
            node.interface.terminal_output = 'allatonce'

        result_future = self.pool.submit(run_node, node, updatehash, self._taskid)
        result_future.add_done_callback(self._async_callback)
        self._task_obj[self._taskid] = result_future

        logger.debug('[MultiProc] Submitted task %s (taskid=%d).',
                     node.fullname, self._taskid)
        return self._taskid

    def _prerun_check(self, graph):
        """Check if any node exeeds the available resources"""
        tasks_mem_gb = []
        tasks_num_th = []
        for node in graph.nodes():
            tasks_mem_gb.append(node.mem_gb)
            tasks_num_th.append(node.n_procs)

        if np.any(np.array(tasks_mem_gb) > self.memory_gb):
            logger.warning(
                'Some nodes exceed the total amount of memory available '
                '(%0.2fGB).', self.memory_gb)
            if self.raise_insufficient:
                raise RuntimeError('Insufficient resources available for job')

        if np.any(np.array(tasks_num_th) > self.processors):
            logger.warning(
                'Some nodes demand more threads than available (%d).',
                self.processors)
            if self.raise_insufficient:
                raise RuntimeError('Insufficient resources available for job')

    def _postrun_check(self):
        self.pool.shutdown()

    def _check_resources(self, running_tasks):
        """
        Make sure there are resources available
        """
        free_memory_gb = self.memory_gb
        free_processors = self.processors
        for _, jobid in running_tasks:
            free_memory_gb -= min(self.procs[jobid].mem_gb, free_memory_gb)
            free_processors -= min(self.procs[jobid].n_procs, free_processors)

        return free_memory_gb, free_processors

    def _send_procs_to_workers(self, updatehash=False, graph=None):
        """
        Sends jobs to workers when system resources are available.
        """

        # Check to see if a job is available (jobs with all dependencies run)
        # See https://github.com/nipy/nipype/pull/2200#discussion_r141605722
        # See also https://github.com/nipy/nipype/issues/2372
        jobids = np.flatnonzero(~self.proc_done &
                                (self.depidx.sum(axis=0) == 0).__array__())

        # Check available resources by summing all threads and memory used
        free_memory_gb, free_processors = self._check_resources(
            self.pending_tasks)

        stats = (len(self.pending_tasks), len(jobids), free_memory_gb,
                 self.memory_gb, free_processors, self.processors)
        if self._stats != stats:
            tasks_list_msg = ''

            if logger.level <= INFO:
                running_tasks = [
                    '  * %s' % self.procs[jobid].fullname
                    for _, jobid in self.pending_tasks
                ]
                if running_tasks:
                    tasks_list_msg = '\nCurrently running:\n'
                    tasks_list_msg += '\n'.join(running_tasks)
                    tasks_list_msg = indent(tasks_list_msg, ' ' * 21)
            logger.info(
                '[MultiProc] Running %d tasks, and %d jobs ready. Free '
                'memory (GB): %0.2f/%0.2f, Free processors: %d/%d.%s',
                len(self.pending_tasks), len(jobids), free_memory_gb,
                self.memory_gb, free_processors, self.processors,
                tasks_list_msg)
            self._stats = stats

        if free_memory_gb < 0.01 or free_processors == 0:
            logger.debug('No resources available')
            return

        if len(jobids) + len(self.pending_tasks) == 0:
            logger.debug('No tasks are being run, and no jobs can '
                         'be submitted to the queue. Potential deadlock')
            return

        jobids = self._sort_jobs(
            jobids, scheduler=self.plugin_args.get('scheduler'))

        # Run garbage collector before potentially submitting jobs
        gc.collect()

        # Submit jobs
        for jobid in jobids:
            # First expand mapnodes
            if isinstance(self.procs[jobid], MapNode):
                try:
                    num_subnodes = self.procs[jobid].num_subnodes()
                except Exception:
                    traceback = format_exception(*sys.exc_info())
                    self._clean_queue(
                        jobid,
                        graph,
                        result={
                            'result': None,
                            'traceback': traceback
                        })
                    self.proc_pending[jobid] = False
                    continue
                if num_subnodes > 1:
                    submit = self._submit_mapnode(jobid)
                    if not submit:
                        continue

            # Check requirements of this job
            next_job_gb = min(self.procs[jobid].mem_gb, self.memory_gb)
            next_job_th = min(self.procs[jobid].n_procs, self.processors)

            # If node does not fit, skip at this moment
            if next_job_th > free_processors or next_job_gb > free_memory_gb:
                logger.debug('Cannot allocate job %d (%0.2fGB, %d threads).',
                             jobid, next_job_gb, next_job_th)
                continue

            free_memory_gb -= next_job_gb
            free_processors -= next_job_th
            logger.debug('Allocating %s ID=%d (%0.2fGB, %d threads). Free: '
                         '%0.2fGB, %d threads.', self.procs[jobid].fullname,
                         jobid, next_job_gb, next_job_th, free_memory_gb,
                         free_processors)

            # change job status in appropriate queues
            self.proc_done[jobid] = True
            self.proc_pending[jobid] = True

            # If cached and up-to-date just retrieve it, don't run
            if self._local_hash_check(jobid, graph):
                continue

            # updatehash and run_without_submitting are also run locally
            if updatehash or self.procs[jobid].run_without_submitting:
                logger.debug('Running node %s on master thread',
                             self.procs[jobid])
                try:
                    self.procs[jobid].run(updatehash=updatehash)
                except Exception:
                    traceback = format_exception(*sys.exc_info())
                    self._clean_queue(
                        jobid,
                        graph,
                        result={
                            'result': None,
                            'traceback': traceback
                        })

                # Release resources
                self._task_finished_cb(jobid)
                self._remove_node_dirs()
                free_memory_gb += next_job_gb
                free_processors += next_job_th
                # Display stats next loop
                self._stats = None

                # Clean up any debris from running node in main process
                gc.collect()
                continue

            # Task should be submitted to workers
            # Send job to task manager and add to pending tasks
            if self._status_callback:
                self._status_callback(self.procs[jobid], 'start')
            tid = self._submit_job(
                deepcopy(self.procs[jobid]), updatehash=updatehash)
            if tid is None:
                self.proc_done[jobid] = False
                self.proc_pending[jobid] = False
            else:
                self.pending_tasks.insert(0, (tid, jobid))
            # Display stats next loop
            self._stats = None

    def _sort_jobs(self, jobids, scheduler='tsort'):
        if scheduler == 'mem_thread':
            return sorted(
                jobids,
                key=lambda item: (self.procs[item].mem_gb, self.procs[item].n_procs)
            )
        return jobids
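
A small sketch of the Python 3.7+ ProcessPoolExecutor keywords used in __init__ above (initializer, initargs, mp_context), here just pinning each worker's start method and working directory:

import multiprocessing as mp
import os
from concurrent.futures import ProcessPoolExecutor

def where_am_i():
    # report the worker's pid and working directory
    return os.getpid(), os.getcwd()

if __name__ == '__main__':
    pool = ProcessPoolExecutor(
        max_workers=2,
        mp_context=mp.get_context('spawn'),  # explicit start method
        initializer=os.chdir,                # runs once in each worker process
        initargs=(os.getcwd(),))
    print(pool.submit(where_am_i).result())
    pool.shutdown()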
Example #50
0
def _parallel_execute(model_params):
    nb_cell = model_params['nb_cell']
    nb_time_step = model_params['nb_time_step']
    progress_desc = model_params['progress_desc']

    node_hierarchy = model_params['node_hierarchy']
    li_cell_up = model_params['li_cell_up']

    pool = ProcessPoolExecutor(max_workers=model_params['nworkers'])

    with tqdm(total=nb_cell,
              ascii=True, desc=progress_desc, unit=' cell') as pbar:
        ## Loop on cell hierarchy
        for lvl in range(len(node_hierarchy.keys())):
            futures = []
            for cell in node_hierarchy[lvl]:

                if cell == model_params['cell_external_flow']:
                    external_flow_flag = True
                else:
                    external_flow_flag = False

                if len(li_cell_up[cell]) > 0:
                    soil_upstream_inflow =    \
                               model_params['dset_Q_down'][1:, li_cell_up[cell]]
                    channel_upstream_inflow = \
                               model_params['dset_Qc_out'][1:, li_cell_up[cell]]
                else:
                    soil_upstream_inflow = [np.array([])
                                            for i in range(nb_time_step)]
                    channel_upstream_inflow = [np.array([])
                                               for i in range(nb_time_step)]

                ts_params = {
                 'cell': cell,
                 'nb_time_step': model_params['nb_time_step'],
                 'Vs_t0': model_params['Vs_t0'][cell],
                 'Vo_t0': model_params['Vo_t0'][cell],
                 'Vc_t0': model_params['Vc_t0'][cell],
                 'psi_b': model_params['psi_b'][cell],
                 'lamda': model_params['lamda'][cell],
                 'external_flow_flag': external_flow_flag,
                 'rainfall_forcing': model_params['rainfall_forcing'][:, cell],
                 'ETr_forcing': model_params['ETr_forcing'][:, cell],
                 'ET0_forcing': model_params['ET0_forcing'][:, cell],
                 'soil_upstream_inflow': soil_upstream_inflow,
                 'channel_upstream_inflow': channel_upstream_inflow,
                 'eff_theta': model_params['eff_theta'][cell],
                 'X': model_params['X'],
                 'W': model_params['W'][cell],
                 'Dt': model_params['Dt'],
                 'Xc': model_params['Xc'][cell],
                 'Kc': model_params['Kc'][cell],
                 'Ks': model_params['Ks'][cell],
                 'b_s': model_params['b_s'][cell],
                 'b_o': model_params['b_o'][cell],
                 'b_c': model_params['b_c'][cell],
                 'alpha_s': model_params['alpha_s'],
                 'alpha_o': model_params['alpha_o'],
                 'alpha_c': model_params['alpha_c'],
                 'solve_s': model_params['solve_s'],
                 'solve_o': model_params['solve_o'],
                 'solve_c': model_params['solve_c'],
                 'Vsm': model_params['Vsm'][cell],
                 'channel_flag': model_params['channel_flag'][cell],
                 'external_flow_records': model_params['external_flow_records']
                    }

                f = pool.submit(_solve_cell_timeseries, ts_params)
                f.add_done_callback(functools.partial(_cell_clean_up,
                                                      cell, pbar, model_params))

                futures.append(f)

            wait(futures)

    pool.shutdown()
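
A compact sketch of the per-level pattern above: submit every cell of a level, attach a progress callback with functools.partial, and wait() for the whole level before starting the next one (solve_cell is an illustrative stand-in):

import functools
from concurrent.futures import ProcessPoolExecutor, wait

def solve_cell(cell):
    # stand-in for the per-cell time-series solve
    return cell, cell * cell

def on_done(cell, future):
    # receives the finished Future as its last argument
    print('cell %d finished with %r' % (cell, future.result()))

if __name__ == '__main__':
    levels = [[0, 1], [2, 3, 4], [5]]
    with ProcessPoolExecutor(max_workers=2) as pool:
        for level in levels:
            futures = []
            for cell in level:
                f = pool.submit(solve_cell, cell)
                f.add_done_callback(functools.partial(on_done, cell))
                futures.append(f)
            # every cell of a level must finish before the next level starts
            wait(futures)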
Example #51
0
class DataRouter(object):
    def __init__(self,
                 project_dir=None,
                 max_training_processes=1,
                 response_log=None,
                 emulation_mode=None,
                 remote_storage=None,
                 component_builder=None,
                 model_server=None,
                 wait_time_between_pulls=None):
        self._training_processes = max(max_training_processes, 1)
        self._current_training_processes = 0
        self.responses = self._create_query_logger(response_log)
        self.project_dir = config.make_path_absolute(project_dir)
        self.emulator = self._create_emulator(emulation_mode)
        self.remote_storage = remote_storage
        self.model_server = model_server
        self.wait_time_between_pulls = wait_time_between_pulls

        if component_builder:
            self.component_builder = component_builder
        else:
            self.component_builder = ComponentBuilder(use_cache=True)

        self.project_store = self._create_project_store(project_dir)

        # tensorflow sessions are not fork-safe,
        # and training processes have to be spawned instead of forked. See
        # https://github.com/tensorflow/tensorflow/issues/5448#issuecomment
        # -258934405
        multiprocessing.set_start_method('spawn', force=True)

        self.pool = ProcessPool(self._training_processes)

    def __del__(self):
        """Terminates workers pool processes"""
        self.pool.shutdown()

    @staticmethod
    def _create_query_logger(response_log):
        """Create a logger that will persist incoming query results."""

        # Ensures different log files for different
        # processes in multi worker mode
        if response_log:
            # We need to generate a unique file name,
            # even in multiprocess environments
            timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            log_file_name = "rasa_nlu_log-{}-{}.log".format(timestamp,
                                                            os.getpid())
            response_logfile = os.path.join(response_log, log_file_name)
            # Instantiate a standard python logger,
            # which we are going to use to log requests
            utils.create_dir_for_file(response_logfile)
            out_file = io.open(response_logfile, 'a', encoding='utf8')
            # noinspection PyTypeChecker
            query_logger = Logger(
                observer=jsonFileLogObserver(out_file, recordSeparator=''),
                namespace='query-logger')
            # Prevents queries getting logged with parent logger
            # --> might log them to stdout
            logger.info("Logging requests to '{}'.".format(response_logfile))
            return query_logger
        else:
            # If the user didn't provide a logging directory, we won't log!
            logger.info("Logging of requests is disabled. "
                        "(No 'request_log' directory configured)")
            return None

    def _collect_projects(self, project_dir: Text) -> List[Text]:
        if project_dir and os.path.isdir(project_dir):
            projects = os.listdir(project_dir)
        else:
            projects = []

        projects.extend(self._list_projects_in_cloud())
        return projects

    def _create_project_store(self,
                              project_dir: Text) -> Dict[Text, Any]:
        default_project = RasaNLUModelConfig.DEFAULT_PROJECT_NAME

        projects = self._collect_projects(project_dir)

        project_store = {}

        if self.model_server is not None:
            project_store[default_project] = load_from_server(
                self.component_builder,
                default_project,
                self.project_dir,
                self.remote_storage,
                self.model_server,
                self.wait_time_between_pulls
            )
        else:
            for project in projects:
                project_store[project] = Project(self.component_builder,
                                                 project,
                                                 self.project_dir,
                                                 self.remote_storage)

            if not project_store:
                project_store[default_project] = Project(
                    project=default_project,
                    project_dir=self.project_dir,
                    remote_storage=self.remote_storage
                )

        return project_store

    def _pre_load(self, projects: List[Text]) -> None:
        logger.debug("loading %s", projects)
        for project in self.project_store:
            if project in projects:
                self.project_store[project].load_model()

    def _list_projects_in_cloud(self) -> List[Text]:
        # noinspection PyBroadException
        try:
            from rasa_nlu.persistor import get_persistor
            p = get_persistor(self.remote_storage)
            if p is not None:
                return p.list_projects()
            else:
                return []
        except Exception:
            logger.exception("Failed to list projects. Make sure you have "
                             "correctly configured your cloud storage "
                             "settings.")
            return []

    @staticmethod
    def _create_emulator(mode: Optional[Text]) -> NoEmulator:
        """Create emulator for specified mode.

        If no emulator is specified, we will use the Rasa NLU format."""

        if mode is None:
            return NoEmulator()
        elif mode.lower() == 'wit':
            from rasa_nlu.emulators.wit import WitEmulator
            return WitEmulator()
        elif mode.lower() == 'luis':
            from rasa_nlu.emulators.luis import LUISEmulator
            return LUISEmulator()
        elif mode.lower() == 'dialogflow':
            from rasa_nlu.emulators.dialogflow import DialogflowEmulator
            return DialogflowEmulator()
        else:
            raise ValueError("unknown mode : {0}".format(mode))

    @staticmethod
    def _tf_in_pipeline(model_config: RasaNLUModelConfig) -> bool:
        from rasa_nlu.classifiers.embedding_intent_classifier import \
            EmbeddingIntentClassifier
        return any(EmbeddingIntentClassifier.name in c.values()
                   for c in model_config.pipeline)

    def extract(self, data: Dict[Text, Any]) -> Dict[Text, Any]:
        return self.emulator.normalise_request_json(data)

    def parse(self, data: Dict[Text, Any]) -> Dict[Text, Any]:
        project = data.get("project", RasaNLUModelConfig.DEFAULT_PROJECT_NAME)
        model = data.get("model")

        if project not in self.project_store:
            projects = self._list_projects(self.project_dir)

            cloud_provided_projects = self._list_projects_in_cloud()
            projects.extend(cloud_provided_projects)

            if project not in projects:
                raise InvalidProjectError(
                    "No project found with name '{}'.".format(project))
            else:
                try:
                    self.project_store[project] = Project(
                        self.component_builder, project,
                        self.project_dir, self.remote_storage)
                except Exception as e:
                    raise InvalidProjectError(
                        "Unable to load project '{}'. "
                        "Error: {}".format(project, e))

        time = data.get('time')
        response = self.project_store[project].parse(data['text'], time,
                                                     model)

        if self.responses:
            self.responses.info('', user_input=response, project=project,
                                model=response.get('model'))

        return self.format_response(response)

    @staticmethod
    def _list_projects(path: Text) -> List[Text]:
        """List the projects in the path, ignoring hidden directories."""
        return [os.path.basename(fn)
                for fn in utils.list_subdirectories(path)]

    def format_response(self, data: Dict[Text, Any]) -> Dict[Text, Any]:
        return self.emulator.normalise_response_json(data)

    def get_status(self) -> Dict[Text, Any]:
        # This will only count the trainings started from this process;
        # if run in multi-worker mode, there might be other trainings
        # running in different processes that we don't know about.

        return {
            "max_training_processes": self._training_processes,
            "current_training_processes": self._current_training_processes,
            "available_projects": {
                name: project.as_dict()
                for name, project in self.project_store.items()
            }
        }

    def start_train_process(self,
                            data_file: Text,
                            project: Text,
                            train_config: RasaNLUModelConfig,
                            model_name: Optional[Text] = None
                            ) -> Deferred:
        """Start a model training."""

        if not project:
            raise InvalidProjectError("Missing project name to train")

        if self._training_processes <= self._current_training_processes:
            raise MaxTrainingError

        if project in self.project_store:
            self.project_store[project].status = STATUS_TRAINING
        elif project not in self.project_store:
            self.project_store[project] = Project(
                self.component_builder, project,
                self.project_dir, self.remote_storage)
            self.project_store[project].status = STATUS_TRAINING

        def training_callback(model_path):
            model_dir = os.path.basename(os.path.normpath(model_path))
            self.project_store[project].update(model_dir)
            self._current_training_processes -= 1
            self.project_store[project].current_training_processes -= 1
            if (self.project_store[project].status == STATUS_TRAINING and
                    self.project_store[project].current_training_processes ==
                    0):
                self.project_store[project].status = STATUS_READY
            return model_path

        def training_errback(failure):
            logger.warning(failure)

            self._current_training_processes -= 1
            self.project_store[project].current_training_processes -= 1
            self.project_store[project].status = STATUS_FAILED
            self.project_store[project].error_message = str(failure)

            return failure

        logger.debug("New training queued")

        self._current_training_processes += 1
        self.project_store[project].current_training_processes += 1

        result = self.pool.submit(do_train_in_worker,
                                  train_config,
                                  data_file,
                                  path=self.project_dir,
                                  project=project,
                                  fixed_model_name=model_name,
                                  storage=self.remote_storage)
        result = deferred_from_future(result)
        result.addCallback(training_callback)
        result.addErrback(training_errback)

        return result

    # noinspection PyProtectedMember
    def evaluate(self,
                 data: Text,
                 project: Optional[Text] = None,
                 model: Optional[Text] = None) -> Dict[Text, Any]:
        """Perform a model evaluation."""

        project = project or RasaNLUModelConfig.DEFAULT_PROJECT_NAME
        model = model or None
        file_name = utils.create_temporary_file(data, "_training_data")

        if project not in self.project_store:
            raise InvalidProjectError("Project {} could not "
                                      "be found".format(project))

        model_name = self.project_store[project]._dynamic_load_model(model)

        self.project_store[project]._loader_lock.acquire()
        try:
            if not self.project_store[project]._models.get(model_name):
                interpreter = self.project_store[project]. \
                    _interpreter_for_model(model_name)
                self.project_store[project]._models[model_name] = interpreter
        finally:
            self.project_store[project]._loader_lock.release()

        return run_evaluation(
            data_path=file_name,
            model=self.project_store[project]._models[model_name],
            errors_filename=None
        )

    def unload_model(self,
                     project: Optional[Text],
                     model: Text) -> Dict[Text, Any]:
        """Unload a model from server memory."""

        if project is None:
            raise InvalidProjectError("No project specified")
        elif project not in self.project_store:
            raise InvalidProjectError("Project {} could not "
                                      "be found".format(project))

        try:
            unloaded_model = self.project_store[project].unload(model)
            return unloaded_model
        except KeyError:
            raise InvalidProjectError("Failed to unload model {} "
                                      "for project {}.".format(model, project))
Example #52
0
             'Start':start,
             'WinSize':win,
             'GroupName':gname,
             'Subtype':sub,
             }
    if type(benj_res) is StringType:
        if (benj_res == 'Already Processed') or benj_res.startswith('Too few unique sequences'):
            continue
        print benj_res, prot, start, win
    else:
        benj_res.update(tdict)
        benj_writer.writerow(benj_res)
    
        
if multi:
    pool.shutdown()

# <codecell>


# <codecell>

#with open('allgp120.fasta', 'w') as handle:
tres = []
for key, row in wanted_data[['gp120-seq-align', 'Tropism']].dropna().iterrows():
    oname = key+'-'+row['Tropism']
    tres.append((oname, ''.join(row['gp120-seq-align'])))
    
    

# <codecell>
Example #53
0
class DataRouter(object):
    def __init__(self,
                 project_dir=None,
                 max_training_processes=1,
                 response_log=None,
                 emulation_mode=None,
                 remote_storage=None,
                 component_builder=None):
        self._training_processes = max(max_training_processes, 1)
        self._current_training_processes = 0
        self.responses = self._create_query_logger(response_log)
        self.project_dir = config.make_path_absolute(project_dir)
        self.emulator = self._create_emulator(emulation_mode)
        self.remote_storage = remote_storage

        if component_builder:
            self.component_builder = component_builder
        else:
            self.component_builder = ComponentBuilder(use_cache=True)

        self.project_store = self._create_project_store(project_dir)
        self.pool = ProcessPool(self._training_processes)

    def __del__(self):
        """Terminates workers pool processes"""
        self.pool.shutdown()

    @staticmethod
    def _create_query_logger(response_log):
        """Create a logger that will persist incoming query results."""

        # Ensures different log files for different
        # processes in multi worker mode
        if response_log:
            # We need to generate a unique file name,
            # even in multiprocess environments
            timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
            log_file_name = "rasa_nlu_log-{}-{}.log".format(timestamp,
                                                            os.getpid())
            response_logfile = os.path.join(response_log, log_file_name)
            # Instantiate a standard python logger,
            # which we are going to use to log requests
            utils.create_dir_for_file(response_logfile)
            out_file = io.open(response_logfile, 'a', encoding='utf8')
            query_logger = Logger(
                    observer=jsonFileLogObserver(out_file, recordSeparator=''),
                    namespace='query-logger')
            # Prevents queries getting logged with parent logger
            # --> might log them to stdout
            logger.info("Logging requests to '{}'.".format(response_logfile))
            return query_logger
        else:
            # If the user didn't provide a logging directory, we won't log!
            logger.info("Logging of requests is disabled. "
                        "(No 'request_log' directory configured)")
            return None

    def _collect_projects(self, project_dir):
        if project_dir and os.path.isdir(project_dir):
            projects = os.listdir(project_dir)
        else:
            projects = []

        projects.extend(self._list_projects_in_cloud())
        return projects

    def _create_project_store(self, project_dir):
        projects = self._collect_projects(project_dir)

        project_store = {}

        for project in projects:
            project_store[project] = Project(self.component_builder,
                                             project,
                                             self.project_dir,
                                             self.remote_storage)

        if not project_store:
            default_model = RasaNLUModelConfig.DEFAULT_PROJECT_NAME
            project_store[default_model] = Project(
                    project=RasaNLUModelConfig.DEFAULT_PROJECT_NAME,
                    project_dir=self.project_dir,
                    remote_storage=self.remote_storage)
        return project_store

    def _pre_load(self, projects):
        logger.debug("loading %s", projects)
        for project in self.project_store:
            if project in projects:
                self.project_store[project].load_model()

    def _list_projects_in_cloud(self):
        try:
            from rasa_nlu.persistor import get_persistor
            p = get_persistor(self.remote_storage)
            if p is not None:
                return p.list_projects()
            else:
                return []
        except Exception:
            logger.exception("Failed to list projects. Make sure you have "
                             "correctly configured your cloud storage "
                             "settings.")
            return []

    @staticmethod
    def _create_emulator(mode):
        """Create emulator for specified mode.

        If no emulator is specified, we will use the Rasa NLU format."""

        if mode is None:
            from rasa_nlu.emulators import NoEmulator
            return NoEmulator()
        elif mode.lower() == 'wit':
            from rasa_nlu.emulators.wit import WitEmulator
            return WitEmulator()
        elif mode.lower() == 'luis':
            from rasa_nlu.emulators.luis import LUISEmulator
            return LUISEmulator()
        elif mode.lower() == 'dialogflow':
            from rasa_nlu.emulators.dialogflow import DialogflowEmulator
            return DialogflowEmulator()
        else:
            raise ValueError("unknown mode : {0}".format(mode))

    def extract(self, data):
        return self.emulator.normalise_request_json(data)

    def parse(self, data):
        project = data.get("project", RasaNLUModelConfig.DEFAULT_PROJECT_NAME)
        model = data.get("model")

        if project not in self.project_store:
            projects = self._list_projects(self.project_dir)

            cloud_provided_projects = self._list_projects_in_cloud()
            projects.extend(cloud_provided_projects)

            if project not in projects:
                raise InvalidProjectError(
                        "No project found with name '{}'.".format(project))
            else:
                try:
                    self.project_store[project] = Project(
                            self.component_builder, project,
                            self.project_dir, self.remote_storage)
                except Exception as e:
                    raise InvalidProjectError(
                            "Unable to load project '{}'. "
                            "Error: {}".format(project, e))

        time = data.get('time')
        response = self.project_store[project].parse(data['text'], time,
                                                     model)

        if self.responses:
            self.responses.info('', user_input=response, project=project,
                                model=response.get('model'))

        return self.format_response(response)

    @staticmethod
    def _list_projects(path):
        """List the projects in the path, ignoring hidden directories."""
        return [os.path.basename(fn)
                for fn in utils.list_subdirectories(path)]

    def parse_training_examples(self, examples, project, model):
        # type: (Optional[List[Message]], Text, Text) -> List[Dict[Text, Text]]
        """Parses a list of training examples to the project interpreter"""

        predictions = []
        for ex in examples:
            logger.debug("Going to parse: {}".format(ex.as_dict()))
            response = self.project_store[project].parse(ex.text,
                                                         None,
                                                         model)
            logger.debug("Received response: {}".format(response))
            predictions.append(response)

        return predictions

    def format_response(self, data):
        return self.emulator.normalise_response_json(data)

    def get_status(self):
        # This will only count the trainings started from this
        # process, if run in multi worker mode, there might
        # be other trainings run in different processes we don't know about.

        return {
            "max_training_processes": self._training_processes,
            "current_training_processes": self._current_training_processes,
            "available_projects": {
                name: project.as_dict()
                for name, project in self.project_store.items()
            }
        }

    def start_train_process(self,
                            data_file,  # type: Text
                            project,  # type: Text
                            train_config,  # type: RasaNLUModelConfig
                            model_name=None  # type: Optional[Text]
                            ):
        # type: (...) -> Deferred
        """Start a model training."""

        if not project:
            raise InvalidProjectError("Missing project name to train")

        if project in self.project_store:
            if self._training_processes <= self._current_training_processes:
                raise MaxTrainingError
            else:
                self.project_store[project].status = 1
        elif project not in self.project_store:
            self.project_store[project] = Project(
                    self.component_builder, project,
                    self.project_dir, self.remote_storage)
            self.project_store[project].status = 1

        def training_callback(model_path):
            model_dir = os.path.basename(os.path.normpath(model_path))
            self.project_store[project].update(model_dir)
            self._current_training_processes -= 1
            self.project_store[project].current_training_processes -= 1
            if (self.project_store[project].status == 1 and
                    self.project_store[project].current_training_processes ==
                    0):
                self.project_store[project].status = 0
            return model_dir

        def training_errback(failure):
            logger.warn(failure)
            target_project = self.project_store.get(
                    failure.value.failed_target_project)
            self._current_training_processes -= 1
            self.project_store[project].current_training_processes -= 1
            if (target_project and
                    self.project_store[project].current_training_processes ==
                    0):
                target_project.status = 0
            return failure

        logger.debug("New training queued")

        self._current_training_processes += 1
        self.project_store[project].current_training_processes += 1

        result = self.pool.submit(do_train_in_worker,
                                  train_config,
                                  data_file,
                                  path=self.project_dir,
                                  project=project,
                                  fixed_model_name=model_name,
                                  storage=self.remote_storage)
        result = deferred_from_future(result)
        result.addCallback(training_callback)
        result.addErrback(training_errback)

        return result

    def evaluate(self, data, project=None, model=None):
        # type: (Text, Optional[Text], Optional[Text]) -> Dict[Text, Any]
        """Perform a model evaluation."""

        project = project or RasaNLUModelConfig.DEFAULT_PROJECT_NAME
        model = model or None
        file_name = utils.create_temporary_file(data, "_training_data")
        test_data = load_data(file_name)

        if project not in self.project_store:
            raise InvalidProjectError("Project {} could not "
                                      "be found".format(project))

        preds_json = self.parse_training_examples(test_data.intent_examples,
                                                  project,
                                                  model)

        predictions = [
            {"text": e.text,
             "intent": e.data.get("intent"),
             "predicted": p.get("intent", {}).get("name"),
             "confidence": p.get("intent", {}).get("confidence")}
            for e, p in zip(test_data.intent_examples, preds_json)
        ]

        y_true = [e.data.get("intent") for e in test_data.intent_examples]
        y_true = clean_intent_labels(y_true)

        y_pred = [p.get("intent", {}).get("name") for p in preds_json]
        y_pred = clean_intent_labels(y_pred)

        report, precision, f1, accuracy = get_evaluation_metrics(y_true,
                                                                 y_pred)

        return {
            "intent_evaluation": {
                "report": report,
                "predictions": predictions,
                "precision": precision,
                "f1_score": f1,
                "accuracy": accuracy}
        }

    def unload_model(self, project, model):
        # type: (Optional[Text], Text) -> Dict[Text, Any]
        """Unload a model from server memory."""

        if project is None:
            raise InvalidProjectError("No project specified")
        elif project not in self.project_store:
            raise InvalidProjectError("Project {} could not "
                                      "be found".format(project))

        try:
            unloaded_model = self.project_store[project].unload(model)
            return unloaded_model
        except KeyError:
            raise InvalidProjectError("Failed to unload model {} "
                                      "for project {}.".format(model, project))
Example #54
0
    def compute_heatmap(self,
                        reader, plot_type, time_mode, time_interval,
                        cache_size=-1,
                        num_of_pixel_of_time_dim=-1,
                        num_of_threads=os.cpu_count(), **kwargs):
        """
            calculate the data for plotting heatmap

        :param reader: reader for data
        :param plot_type: types of data, see heatmap (function) for details
        :param time_mode: real time (r) or virtual time (v)
        :param time_interval: the window size in computation
        :param cache_size: size of cache
        :param num_of_pixel_of_time_dim: as an alternative to time_interval, useful when you don't know the trace time span
        :param num_of_threads: number of threads/processes to use for computation, default: all
        :param kwargs: cache_params,
        :return:  a two-dimension list, the first dimension is x, the second dimension is y, the value is the heat value
        """


        bp = get_breakpoints(reader, time_mode, time_interval, num_of_pixel_of_time_dim)
        ppe = ProcessPoolExecutor(max_workers=num_of_threads)
        futures_dict = {}
        progress = 0
        xydict = np.zeros((len(bp)-1, len(bp)-1))


        if plot_type in [
            "avg_rd_st_et",
            "rd_distribution",
            "rd_distribution_CDF",
            "future_rd_distribution",
            "dist_distribution",
            "rt_distribution"
        ]:
            pass

        elif plot_type == "hr_st_et":
            ema_coef = kwargs.get("ema_coef", DEF_EMA_HISTORY_WEIGHT)
            enable_ihr = kwargs.get("interval_hit_ratio", False) or kwargs.get("enable_ihr", False)


            if kwargs.get("algorithm", "LRU").lower() == "lru":
                #TODO: replace CLRUProfiler with PyLRUProfiler
                rd = LRUProfiler(reader).get_reuse_distance()
                last_access_dist = get_last_access_dist(reader)

                for i in range(len(bp) - 1):
                    futures_dict[ppe.submit(cal_hr_list_LRU, rd, last_access_dist,
                                            cache_size, bp, i, enable_ihr=enable_ihr, ema_coef=ema_coef)] = i
            else:
                reader_params = reader.get_params()
                reader_params["open_c_reader"] = False
                cache_class = cache_name_to_class(kwargs.get("algorithm"))
                cache_params = kwargs.get("cache_params", {})

                for i in range(len(bp) - 1):
                    futures_dict[ppe.submit(cal_hr_list_general, reader.__class__, reader_params,
                                            cache_class, cache_size, bp, i, cache_params=cache_params)] = i

        elif plot_type == "hr_st_size":
            raise RuntimeError("Not Implemented")

        elif plot_type == "KL_st_et":
            rd = LRUProfiler(reader).get_reuse_distance()

            for i in range(len(bp) - 1):
                futures_dict[ppe.submit(cal_KL, rd, bp, i)] = i


        else:
            ppe.shutdown()
            raise RuntimeError("{} is not a valid heatmap type".format(plot_type))


        last_progress_print_time = time.time()
        for future in as_completed(futures_dict):
            result = future.result()
            xydict[-len(result):, futures_dict[future]] = np.array(result)
            # print("{} {}".format(xydict[futures_dict[future]], np.array(result)))
            progress += 1
            if time.time() - last_progress_print_time > 20:
                INFO("{:.2f}%".format(progress / len(futures_dict) * 100), end="\r")
                last_progress_print_time = time.time()

        ppe.shutdown()
        return xydict
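
The structure of compute_heatmap is the same for every plot type: one worker job per breakpoint column, a futures_dict that maps each future back to its column index, and an as_completed loop that writes results into the output matrix as they arrive. A stripped-down sketch of that fan-out/collect pattern with a toy worker; the fake_column function below is hypothetical.

import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed

def fake_column(i, n):
    # Hypothetical worker: compute one column of the heatmap.
    return [i * n + j for j in range(n - i)]

def compute_matrix(n, num_of_threads=4):
    xydict = np.zeros((n, n))
    with ProcessPoolExecutor(max_workers=num_of_threads) as ppe:
        # Map each submitted future back to the column it belongs to.
        futures_dict = {ppe.submit(fake_column, i, n): i for i in range(n)}
        for future in as_completed(futures_dict):
            result = future.result()
            # Results arrive in any order; futures_dict recovers the column
            # index, and shorter columns are aligned to the bottom rows.
            xydict[-len(result):, futures_dict[future]] = np.array(result)
    return xydict

if __name__ == "__main__":
    print(compute_matrix(4))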
Example #55
0
class BokehTornado(TornadoApplication):
    ''' A Tornado Application used to implement the Bokeh Server.

        The Server class is the main public interface, this class has
        Tornado implementation details.

    Args:
        applications (dict of str : bokeh.application.Application) : map from paths to Application instances
            The application is used to create documents for each session.
        extra_patterns (seq[tuple]) : tuples of (str, http or websocket handler)
            Use this argument to add additional endpoints to custom deployments
            of the Bokeh Server.

    '''

    def __init__(self, applications, io_loop=None, extra_patterns=None):
        if io_loop is None:
            io_loop = IOLoop.current()
        self._loop = io_loop

        self._resources = {}

        # Wrap applications in ApplicationContext
        self._applications = dict()
        for k,v in applications.items():
            self._applications[k] = ApplicationContext(v, self._loop)

        extra_patterns = extra_patterns or []
        relative_patterns = []
        for key in applications:
            for p in per_app_patterns:
                if key == "/":
                    route = p[0]
                else:
                    route = key + p[0]
                relative_patterns.append((route, p[1], { "application_context" : self._applications[key] }))
        websocket_path = None
        for r in relative_patterns:
            if r[0].endswith("/ws"):
                websocket_path = r[0]
        if not websocket_path:
            raise RuntimeError("Couldn't find websocket path")
        for r in relative_patterns:
            r[2]["bokeh_websocket_path"] = websocket_path

        all_patterns = extra_patterns + relative_patterns + toplevel_patterns
        log.debug("Patterns are: %r", all_patterns)
        super(BokehTornado, self).__init__(all_patterns, **settings)

        self._clients = set()
        self._executor = ProcessPoolExecutor(max_workers=4)
        self._loop.add_callback(self._start_async)
        self._stats_job = PeriodicCallback(self.log_stats, 15.0 * 1000, io_loop=self._loop)
        self._stats_job.start()
        self._unused_session_linger_seconds = 60*30
        self._cleanup_job = PeriodicCallback(self.cleanup_sessions, 17.0 * 1000, io_loop=self._loop)
        self._cleanup_job.start()

    @property
    def io_loop(self):
        return self._loop

    def root_url_for_request(self, request):
        # If we add a "whole server prefix," we'd put that on here too
        return request.protocol + "://" + request.host + "/"

    def websocket_url_for_request(self, request, websocket_path):
        protocol = "ws"
        if request.protocol == "https":
            protocol = "wss"
        return protocol + "://" + request.host + websocket_path

    def resources(self, request):
        root_url = self.root_url_for_request(request)
        if root_url not in self._resources:
            self._resources[root_url] = Resources(mode="server", root_url=root_url)
        return self._resources[root_url]

    def start(self):
        ''' Start the Bokeh Server application main loop.

        Returns:
            None

        Notes:
            Keyboard interrupts or sigterm will cause the server to shut down.

        '''
        try:
            self._loop.start()
        except KeyboardInterrupt:
            print("\nInterrupted, shutting down")

    def stop(self):
        ''' Stop the Bokeh Server application.

        Returns:
            None

        '''
        self._loop.stop()

    @property
    def executor(self):
        return self._executor

    def new_connection(self, protocol, socket, application_context, session):
        connection = ServerConnection(protocol, socket, application_context, session)
        self._clients.add(connection)
        return connection

    def client_lost(self, connection):
        self._clients.discard(connection)
        connection.detach_session()

    def get_session(self, app_path, session_id):
        if app_path not in self._applications:
            raise ValueError("Application %s does not exist on this server" % app_path)
        return self._applications[app_path].get_session(session_id)

    def cleanup_sessions(self):
        for app in self._applications.values():
            app.cleanup_sessions(self._unused_session_linger_seconds)

    def log_stats(self):
        log.debug("[pid %d] %d clients connected", os.getpid(), len(self._clients))

    @gen.coroutine
    def run_in_background(self, _func, *args, **kwargs):
        """
        Run a synchronous function in the background without disrupting
        the main thread. Useful for long-running jobs.
        """
        res = yield self._executor.submit(_func, *args, **kwargs)
        raise gen.Return(res)

    @gen.coroutine
    def _start_async(self):
        try:
            atexit.register(self._atexit)
            signal.signal(signal.SIGTERM, self._sigterm)
        except Exception:
            self.exit(1)

    _atexit_ran = False
    def _atexit(self):
        if self._atexit_ran:
            return
        self._atexit_ran = True

        self._stats_job.stop()
        IOLoop.clear_current()
        loop = IOLoop()
        loop.make_current()
        loop.run_sync(self._cleanup)

    def _sigterm(self, signum, frame):
        print("Received SIGTERM, shutting down")
        self.stop()
        self._atexit()

    @gen.coroutine
    def _cleanup(self):
        log.debug("Shutdown: cleaning up")
        self._executor.shutdown(wait=False)
        self._clients.clear()
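
run_in_background relies on the fact that a concurrent.futures.Future returned by ProcessPoolExecutor.submit can be yielded directly inside a tornado.gen coroutine, so a blocking function runs in a worker process while the IOLoop keeps serving requests. A minimal, self-contained sketch of the same pattern outside of Bokeh; the FibHandler class and slow_fibonacci function are made up for illustration.

import os
from concurrent.futures import ProcessPoolExecutor

from tornado import gen, ioloop, web

def slow_fibonacci(n):
    # CPU-bound work that would otherwise block the IOLoop.
    return n if n < 2 else slow_fibonacci(n - 1) + slow_fibonacci(n - 2)

executor = ProcessPoolExecutor(max_workers=4)

class FibHandler(web.RequestHandler):
    @gen.coroutine
    def get(self, n):
        # Yielding the executor future suspends this coroutine until the
        # worker process finishes, without blocking other requests.
        result = yield executor.submit(slow_fibonacci, int(n))
        self.write({"pid": os.getpid(), "result": result})

if __name__ == "__main__":
    app = web.Application([(r"/fib/(\d+)", FibHandler)])
    app.listen(8888)
    ioloop.IOLoop.current().start()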
Example #56
0
class AMRProcessor:

    def __init__(self, max_workers=None, verbose=False):
        self.verbose = verbose
        self.pool = ProcessPoolExecutor(max_workers=max_workers) if max_workers is None or max_workers > 1 else None

    def shutdown(self, wait=True):
        if self.pool:
            self.pool.shutdown(wait=wait)

    def sentences(self, gold_lines, silver_lines, verbose=False, seed=None, loop=None):

        def extract_amr_pairs(gold_lines, silver_lines):
            while True:
                gold_amr = AMR.read(gold_lines)
                silver_amr = AMR.read(silver_lines)
                if gold_amr is None and silver_amr is None:
                    break
                elif gold_amr is None or silver_amr is None:
                    raise Exception('mismatched AMR count')
                yield gold_amr, silver_amr

        AMRPair.seed = seed

        self.total_match_num = 0
        self.total_test_num = 0
        self.total_gold_num = 0

        self.skipped = 0
        self.good = 0

        nr = 0

        def process_sentence(sentence):
            nonlocal nr

            sentence = Dict(sentence)
            sentence.gold = Dict(sentence.gold)
            sentence.silver = Dict(sentence.silver)

            nr += 1

            if not sentence:
                if verbose:
                    print('Skipping empty sentence:', nr)  # gold_amr is not in scope here, so report the running index
                self.skipped += 1
                return

            if verbose:
                print(sentence.gold.text)

            gold_triple_num = len(sentence.gold.instances) + len(sentence.gold.attributes) + len(sentence.gold.relations)
            test_triple_num = len(sentence.silver.instances) + len(sentence.silver.attributes) + len(sentence.silver.relations)
            gold_triple_num += 1 if sentence.gold.top else 0
            test_triple_num += 1 if sentence.silver.top else 0

            # if each AMR pair should have a score, compute and output it here
            sentence.precision, sentence.recall, sentence.best_f_score = smatch.compute_f(sentence.best_match_num, test_triple_num, gold_triple_num)

            # sentence.precision = precision
            # sentence.recall = recall
            # sentence.best_f_score = best_f_score
            
            self.total_match_num += sentence.best_match_num
            self.total_test_num += test_triple_num
            self.total_gold_num += gold_triple_num

            if verbose:
                print()
                print("Precision: %.4f" % sentence.precision)
                print("Recall: %.4f" % sentence.recall)
                print("Smatch score: %.4f" % sentence.best_f_score)
                print()
            else:
                print('.', end='', flush=True)

            self.good += 1

            sentence.nr = nr

            return sentence

        if loop is None:
            loop = asyncio.get_event_loop()

        class AMap:
            def __init__(self, func, futures):
                self.func = func
                self.futures = iter(futures)
            def __aiter__(self):
                # 'async for' expects __aiter__ to return the iterator directly, not a coroutine
                return self
            async def __anext__(self):
                try:
                    future = next(self.futures)
                except StopIteration:
                    raise StopAsyncIteration
                return self.func(await future)

        results = list(loop.run_in_executor(self.pool, AMRPair.make, amr_pair) for amr_pair in extract_amr_pairs(gold_lines, silver_lines))

        return AMap(process_sentence, results)

        # for result in results:
        #     sentence = await result
        #     sentence = process_sentence(sentence)
    
        # for sentence in (self.pool.map if self.pool else map)(AMRPair.make, extract_amr_pairs(gold_lines, silver_lines)):
        #     sentence = process_sentence(sentence)
        #     yield sentence

    async def __call__(self, gold_lines, silver_lines, verbose=False, seed=None):

        sentences = []
        async for sentence in self.sentences(gold_lines, silver_lines, verbose=verbose, seed=seed):
            sentences.append(sentence)

        # sentences = list(self.sentences(gold_lines, silver_lines, verbose=verbose, seed=seed))

        precision, recall, best_f_score = smatch.compute_f(self.total_match_num, self.total_test_num, self.total_gold_num)

        if verbose:
            print("Total:")
            print()
            print("Precision: %.4f" % precision)
            print("Recall: %.4f" % recall)
            print("Smatch score: %.4f" % best_f_score)

        return Dict(sentences=sentences, precision=precision, recall=recall, best_f_score=best_f_score, skipped=self.skipped, good=self.good)
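
sentences fans the expensive AMRPair.make calls out to a ProcessPoolExecutor via loop.run_in_executor and then drains the resulting futures through the small AMap async iterator. A self-contained sketch of that combination with a toy picklable worker, since the real AMRPair is not shown here; the heavy_score function is hypothetical.

import asyncio
from concurrent.futures import ProcessPoolExecutor

def heavy_score(x):
    # Hypothetical CPU-bound worker standing in for AMRPair.make.
    return x * x

class AMap:
    """Async-iterate over awaitables, applying func to each awaited result."""
    def __init__(self, func, awaitables):
        self.func = func
        self.awaitables = iter(awaitables)
    def __aiter__(self):
        # Plain method returning self, as required by 'async for'.
        return self
    async def __anext__(self):
        try:
            awaitable = next(self.awaitables)
        except StopIteration:
            raise StopAsyncIteration
        return self.func(await awaitable)

async def main():
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor() as pool:
        # Submit every job up front, then consume results in submission order.
        futures = [loop.run_in_executor(pool, heavy_score, i) for i in range(5)]
        async for labelled in AMap(lambda r: {"score": r}, futures):
            print(labelled)

if __name__ == "__main__":
    asyncio.run(main())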