def firstlevel(inputs, args):
    commands = list()
    imagelist = list()
    for i, subject in enumerate(inputs, start=0):
        if not is_non_zero_file("output/subject{}/COMPLETE".format(i)):
            # Base command
            command = f"{args.modelbuild_command} -d 3 "
            # Setup directory and naming
            command += "-o output/subject{}/subject{}_ ".format(i, i)
            # Defaults to bootstrap modelbuilds with rigid prealignment,
            # no rigid update
            command += "-r 1 -l 1 -y 0 "
            # Model build setup
            command += "-c {} -a {} -e {} -g {} -i {} -n {} -m {} -t {} -u {} -v {} ".format(
                args.cluster_type,
                args.average_type,
                args.float,
                args.gradient_step,
                args.model_iterations,
                int(args.N4),
                args.metric,
                args.transform,
                args.walltime,
                args.memory_request,
            )
            # Registrations Setup
            command += "-q {} -f {} -s {} ".format(args.reg_iterations,
                                                   args.reg_shrinks,
                                                   args.reg_smoothing)
            if args.rigid_model_target:
                command += "-z {} ".format(args.rigid_model_target)
            command += " ".join(subject)
            command += " && echo DONE > output/subject{}/COMPLETE".format(i)
            commands.append(command)

        imagelist.append(
            subject +
            ["output/subject{0}/subject{0}_template0.nii.gz".format(i)])
    # Here we should add the ability to limit the number of commands submitted
    results = list()
    if len(commands) > 0:
        if args.cluster_type != 0:
            pool = multiprocessing.ProcessPool(nodes=len(commands))
        else:
            pool = multiprocessing.ProcessPool(nodes=args.local_threads)

        for item in tqdm.tqdm(
                pool.uimap(lambda x: run_command(x, args.dry_run), commands),
                total=len(commands),
        ):
            results.append(item)
        if not args.dry_run:
            for i, subject in enumerate(results, start=0):
                with open("output/subject{0}/subject{0}.log".format(i),
                          "wb") as logfile:
                    logfile.write(subject.stdout)
        pool.close()
        # Needed to completely destroy the pool so that pathos doesn't reuse
        pool.clear()
    secondlevel(imagelist, args, secondlevel=True)
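# A minimal sketch (added; not from the source) of the helpers firstlevel()
# and secondlevel() assume: run_command() returning an object with a stdout
# attribute, and is_non_zero_file() testing for a non-empty marker file.
# Hypothetical implementations, shown only to make the examples self-contained.
import os
import subprocess


def run_command(command, dry_run=False):
    # On dry runs, skip execution and hand back an empty result object;
    # otherwise run through the shell and capture output for the log files.
    if dry_run:
        return subprocess.CompletedProcess(command, 0, stdout=b"", stderr=b"")
    return subprocess.run(command, shell=True, capture_output=True)


def is_non_zero_file(path):
    # True if the file exists and is not empty.
    return os.path.isfile(path) and os.path.getsize(path) > 0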
Example #2
def partition_table(tbl, partition_count, drop=True):
    """
    Partitions a biom table into n parts by sample

    Parameters
    ----------
    tbl: biom.Table
        table we are partitioning
    partition_count: int
        number of partitions to partition into
    drop: bool
        whether to drop columns from the source table as we partition;
        set to True to reduce memory use

    Returns
    -------
    list of biom.Table of length partition_count
    processor pool we should re-use
    """
    print("partition_Table() starting at " + time.strftime("[%H:%M:%S]"), flush=True)
    sids = tbl.ids()
    id_parts = np.array_split(sids, partition_count)

    pool = mp.ProcessPool(nodes=len(id_parts), maxtasksperchild=1)
    args = [(tbl, x, drop) for x in id_parts]
    results = pool.map(index_tbl, args)
    return results, pool
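# A hedged usage sketch (added; not from the source). The imports cover the
# globals the snippet above relies on (time, np, mp); index_tbl, the
# per-partition worker, is stood in for by a simple sample subsetter so the
# call can run on a toy table. Whether pathos accepts the maxtasksperchild
# keyword used above depends on the installed version.
import time
import numpy as np
import biom
import pathos.multiprocessing as mp


def index_tbl(args):
    tbl, ids, drop = args
    # Keep only this partition's samples (stand-in for the real worker).
    return tbl.filter(set(ids), axis='sample', inplace=False)


toy = biom.Table(np.arange(12).reshape(3, 4),
                 observation_ids=['o1', 'o2', 'o3'],
                 sample_ids=['s1', 's2', 's3', 's4'])
parts, pool = partition_table(toy, partition_count=2)
pool.close(); pool.join(); pool.clear()   # release the re-usable pool when done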
Example #3
def make_dataset(data_path, cfg, mode='training'):
    try:
        assert cfg.serialized

        return h5_wrapper(cfg.dump_path)
    except Exception:
        track_pd = pd.read_csv(os.path.join(data_path,
                                            "fma_metadata/tracks.csv"),
                               index_col=0,
                               header=[0, 1])
        #track_pd = track_pd[track_pd.set.subset == 'small']
        track_pd = track_pd[track_pd.set.split == mode]
        track_pd = track_pd[track_pd.track.genre_top.notnull()]
        pair_list = list()
        for id_, info in tqdm(track_pd.iterrows(), total=len(track_pd.index)):
            tid_str = '{:06d}'.format(id_)
            audio_path = os.path.join(data_path, "fma_medium", tid_str[:3],
                                      tid_str + '.mp3')
            pair_list.append((audio_path, info.track.genre_top))

        p = pmp.ProcessPool(96)
        proc_fn = _proc_func()
        data_iter = p.imap(proc_fn, pair_list)
        with tqdm(total=len(pair_list)) as pbar:
            for i, res in enumerate(tqdm(data_iter)):
                pbar.update()
                if cfg.dump_path is not None and cfg.need_serialize:
                    if res is not None:
                        dump_to_h5(cfg.dump_path, res)

        return h5_wrapper(cfg.dump_path)
def BuildLeftRightList(transcript):
    mean_left = []
    mean_right = []

    ## Defined as an inner function so the multiprocessing workers inherit `transcript` from the enclosing scope
    def SamIteration(i):
        name = input.name.split(".")[0]
        samname = name + ".bam"
        samfile = pysam.AlignmentFile(samname, "rb")
        samples = list(samfile.fetch(transcript, i, i + 1))
        tmp_left = []
        tmp_right = []
        ## Downsample to avoid huge calculations and maybe biases?
        if len(samples) > 100:
            new_samples = random.sample(samples, 100)
        else:
            new_samples = samples
        for x in new_samples:
            tmp_left.append(int(i - x.reference_start))
            tmp_right.append(int(x.reference_end - i))
        return ([np.mean(tmp_left), np.mean(tmp_right)])

    # Open the alignment in the outer scope as well, to query the reference length
    name = input.name.split(".")[0]
    samfile = pysam.AlignmentFile(name + ".bam", "rb")
    p = mp.ProcessPool(nodes=procs)
    means = p.map(SamIteration,
                  range(0, samfile.get_reference_length(transcript)))
    p.close()
    p.join()
    p.clear()

    for mean in means:
        mean_left.append(mean[0])
        mean_right.append(mean[1])
    return (mean_left, mean_right)
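# A minimal sketch of the pattern above (added; not from the source): pathos
# serializes closures with dill, so a worker defined inside the enclosing
# function can capture shared read-only state, just as SamIteration() captures
# `transcript`, without threading it through map() explicitly.
import pathos.multiprocessing as mp


def summarize(values, offset):
    def worker(v):
        # `offset` is captured from the enclosing scope.
        return v + offset

    pool = mp.ProcessPool(nodes=2)
    try:
        return pool.map(worker, values)
    finally:
        pool.close()
        pool.join()
        pool.clear()


# summarize([1, 2, 3], offset=10) -> [11, 12, 13]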
Example #5
    def initialize(self):
        cfg = self.config
        self.mode = cfg.mode
        proc_fn = _proc_func(self.config)
        self.proc_fn = proc_fn
        audio_fn_list = glob(cfg.audio_pattern)
        '''
        try:
            h5_file = h5py.File(cfg.dump_path, 'r', libver='latest', swmr=True)
            fn_list = list(h5_file.keys())
            audio_fn_list = list(set(audio_fn_list) - set(fn_list))
        except:
            pass
        '''

        self.note_fn_list, self.audio_fn_list = self._match_pair(
            sorted(glob(cfg.note_pattern)), audio_fn_list)
        if cfg.dump_path is not None and cfg.serialized:
            #self.data = h5py.File(cfg.dump_path, 'r', libver='latest', swmr=True)
            return None

        #profile
        p = pmp.ProcessPool(cfg.num_workers)
        data_iter = p.imap(proc_fn, zip(self.audio_fn_list, self.note_fn_list))
        self.data = list()
        with tqdm(total=len(self.audio_fn_list)) as pbar:
            for i, res in enumerate(tqdm(data_iter)):
                pbar.update()
                if cfg.dump_path is not None and cfg.need_serialize:
                    if res is not None:
                        dump_to_h5(cfg.dump_path, res)
Example #6
 def run(self):
     procs = config.procs or (multiprocessing.cpu_count() - 1)
     self.pool = multiprocessing.ProcessPool(nodes=procs)
     for articles in self.articles_sequence(procs=procs):
         results = self.pool.map(ArticleCrawler.crawling, articles)
         for rows in list(results):
             self.write_handler(rows)
         if self.status != 'active':
             self.status = 'terminated'
             break
         self.pool.restart()
Example #7
    def exec(self,
             oIterable,
             oFunc,
             sEnvironment,
             chunkSize=1,
             pReduceFunc=None):

        self.pool = mp.ProcessPool(self.nprocs)
        allResults = []

        resultObj = None

        for x in self.chunkIterable(oIterable, chunkSize):
            allResults.append(self.pool.apipe(oFunc, x, sEnvironment))

        self.pool.close()

        while len(allResults) > 0:

            i = 0
            while i < len(allResults):

                if allResults[i].ready():

                    result = allResults[i].get()

                    if pReduceFunc is not None:

                        resultObj = pReduceFunc(resultObj, result,
                                                sEnvironment)

                    else:

                        if resultObj is None:
                            resultObj = []

                        resultObj.append(result)

                    del allResults[i]

                else:
                    i += 1

            time.sleep(0.5)

        print("Pool Join")
        self.pool.join()
        print("Pool Clear")
        self.pool.clear()
        print("Pool closed")

        return resultObj
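# A hedged usage sketch for exec() above (the class instance, the input file
# and both callbacks are assumptions): chunks of an iterable are dispatched
# with apipe(), and the optional reducer folds results as they complete.
# def count_lines(chunk, env):
#     return sum(1 for _ in chunk)
#
# def add(acc, result, env):
#     return (acc or 0) + result
#
# runner = ...   # instance of the class that defines exec()
# total = runner.exec(open("data.txt"), count_lines, sEnvironment=None,
#                     chunkSize=100, pReduceFunc=add)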
    def P2( self ):
        t1 = time.time()
        p = mp.ProcessPool( 4 )
        HelpLists = []
        results = []
        for _ in range(10):
            results.append( p.apipe( self.PlayOut4, ((10,1),(30,14)) , self.PositionDictManager ) )
        for r in results:
            HelpLists.append( r.get() )

        p.close()
        p.join()
        p.clear()
        t2 = time.time()
        print "2 Finish", t2 - t1
def run_level(
    model_name: str,
    level_lambda: Dict,
    level: int,
    location_ids: List[int],
    model_data: pd.DataFrame,
    hierarchy: pd.DataFrame,
    prior_dicts: Dict,
    var_args: Dict,
    location_prior_dict: Dict,
    child_cutoff_level: int,
    global_mr_data: MRData,
    verbose: bool,
):
    level_mr_model_dict = {}
    level_prior_dicts = {}

    if verbose:
        logger.info(f'Modeling hierarchy level {level}.')
    _rl = functools.partial(
        run_location,
        model_name=model_name,
        level_lambda=level_lambda,
        level=level,
        model_data=model_data,
        hierarchy=hierarchy,
        prior_dicts=prior_dicts,
        var_args=var_args,
        location_prior_dict=location_prior_dict,
        child_cutoff_level=child_cutoff_level,
        global_mr_data=global_mr_data,
    )
    with multiprocessing.ProcessPool(int(OMP_NUM_THREADS)) as p:
        results = list(
            tqdm(p.imap(_rl, location_ids),
                 total=len(location_ids),
                 file=sys.stdout))
    level_mr_model_dict = {
        location_id: result[0]
        for location_id, result in zip(location_ids, results)
    }
    level_prior_dicts = {
        location_id: result[1]
        for location_id, result in zip(location_ids, results)
    }

    return level_mr_model_dict, level_prior_dicts
    def P1( self, GameStateActionList ):
        p = mp.ProcessPool( 4 )
        #p = mp.Pool( processes=4 )  
        t1 = time.time()
        
        #print "Parallel Begin"
        ActionSeriesLists = []
        results = []
        for gs, a in GameStateActionList:
            results.append( p.apipe( self.PlayOut3, gs, a, self.PositionDictManager ) ) 
        for r in results:
            ActionSeriesLists.append( r.get() )

        p.close()
        p.join()
        p.clear()
        
        t2 = time.time()
        #print "P1 Finish!", t2 - t1
         
        return ActionSeriesLists
Example #11
    def validate_images(self, fd, image_hashes=None, remove_invalid=True):
        to_val_hashes = [filename.split('.')[0] for filename in os.listdir(fd)] \
                if image_hashes is None else image_hashes

        n_worker = max(1, mp.cpu_count())
        pool = mp.ProcessPool(n_worker)
        # be careful when len(to_val_hashes) < n_worker
        chunk_size = max(len(to_val_hashes) // n_worker, 1)
        to_val_splits = []
        i = 0
        while i < len(to_val_hashes):
            j = min(i+chunk_size, len(to_val_hashes))
            to_val_splits.append(to_val_hashes[i:j])
            i = j

        n_splits = len(to_val_splits)
        invalid_hashes = pool.map(validate_images, to_val_splits, [fd]*n_splits,
                [remove_invalid]*n_splits)
        invalid_hashes = [h for invalid_split in invalid_hashes for h in invalid_split]
        print('found {} invalid images'.format(len(invalid_hashes)))
        return invalid_hashes
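# A small aside (added; not from the source): numpy's array_split gives the
# same near-even chunking in one call and copes with fewer hashes than workers.
import numpy as np


def split_hashes(hashes, n_worker):
    # Drop the empty chunks that appear when len(hashes) < n_worker.
    return [list(c) for c in np.array_split(list(hashes), n_worker) if len(c)]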
Example #12
    def compile_dict(self, docs):

        import pathos.multiprocessing as mp
        import time

        start = time.time()

        pool = mp.ProcessPool(nodes=self.processes)
        cleaned = pool.amap(self.clean_text, docs)

        print('pool done:', time.time() - start)

        cleaned = cleaned.get()

        print('get done:', time.time() - start)

        dicts = [self.tkn(doc, False) for doc in cleaned]

        print('tokenize:', time.time() - start)

        return dicts
    def evaluate( self, argv_list ):
        score_dict = dict()
        print "c"*50
        p = mp.ProcessPool( 30 )
        results = []
        for argv in argv_list:
            #print "argv", argv
            #print "argv", argv
            #print "argv-weights", argv.Param_Weights 
            results.append( p.apipe( MP, argv ) )
	
        #print "result argv" 
        for r in results:
            c = r.get() 
            scores, redWinRate, blueWinRate, current_serial_info = c
            print "=" * 50
            print scores, redWinRate, blueWinRate, current_serial_info
            print "current_serial_info"
            print current_serial_info  
            agent_index, CurrentGameNum, CurrentAverageScore, CurrentWinRate = current_serial_info
            if score_dict.get( agent_index ) is None:
               score_dict[ agent_index ] = ( agent_index, CurrentGameNum, CurrentAverageScore, CurrentWinRate )
            agent_index, CurrentGameNum, CurrentAverageScore, CurrentWinRate = score_dict.get( agent_index )
                        
            SumGameNum = NumGames + CurrentGameNum
            ### the following set if specific for red team
            # SumScore = CurrentGameNum * CurrentAverageScore + sum( scores )
            # SumWin = CurrentGameNum * CurrentWinRate + NumGames * redWinRate

            ### the following set is specific for blue team
            SumScore = CurrentGameNum * CurrentAverageScore + sum( scores )
            SumWin = CurrentGameNum * CurrentWinRate + NumGames * redWinRate
            current_serial_info = ( agent_index, SumGameNum, SumScore / float( SumGameNum), SumWin / float( SumGameNum ) )  
            score_dict[ agent_index ] =  current_serial_info
            
            score_mat = np.zeros( [ self.NumUnit, 4 ] )
            for index in range(len(score_mat)):
                score_mat[ index ] = score_dict.get( index )
 
        return score_mat
Example #14
def conv_n_filter(input_file_name,
                  output_file_name,
                  input_format="fastq",
                  output_format="fasta",
                  max_stop_codons=3,
                  multiprocessing=False):
    """
    Convert sequences from file to another format, replace <:> from read names\
    with <_> and filter out those with number of stop codons above maximum\
    value. Write results to file.

    Parameters
    -------
    input_file_name: str
        Path to input file.
    output_file_name: str
        Path to output file.
    input_format: str
        Input file format. Default <fastq>.
    output_format: str
        Output file format. Default <fasta>.
    max_stop_codons: int
        Maximum value of stop codons allowed. Default: <3>.
    multiprocessing: bool
        Experimental. Use all the cores on a given machine. Default <False>.
    """
    def f(i):
        print "Processing {}...".format(i)
        records_list = list(SeqIO.parse(i, format=input_format))
        filtered = find_stop_codons(threshold=max_stop_codons,
                                    records=records_list)
        with open(output_file_name, "w") as fout:
            SeqIO.write(filtered, fout, format=output_format)
        print "DONE!"
    if multiprocessing is True:
        ptmp.ProcessPool().map(sanitize_names, input_file_name)
        ptmp.ProcessingPool().map(f, input_file_name)
    else:
        sanitize_names(input_file_name, input_file_name)
        f(input_file_name)
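# A hedged usage sketch (file names are placeholders; sanitize_names() and
# find_stop_codons() are helpers assumed to exist elsewhere in this module):
# convert a FASTQ file to FASTA while dropping reads with too many stop codons.
# conv_n_filter("reads.fastq", "reads.filtered.fasta",
#               input_format="fastq", output_format="fasta",
#               max_stop_codons=3, multiprocessing=False)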
Example #15
 def evolution_func(experiment_name,
                    pop_size,
                    num_epochs,
                    cur_epoch=1,
                    num_threads=-1):
     pool = pathos_mp.ProcessPool(
         num_threads) if num_threads > 1 else None
     pickle_pop_size_num_epochs(experiment_name, pop_size, num_epochs)
     pop, operators = generate_pop(pop_size)
     pickle_models_and_operators(pop, operators, experiment_name,
                                 cur_epoch, pop_size)
     while cur_epoch <= num_epochs:
         rewards = compute_rewards(pool, pop, cur_epoch, pop_size,
                                   experiment_name)
         pickle_rewards(rewards, experiment_name)
         print("Epoch {}, best reward is {}".format(
             cur_epoch, max(rewards)))
         cur_epoch += 1
         if cur_epoch <= num_epochs:
             pop, operators = get_new_pop_func(pop, rewards, pop_size)
             pickle_models_and_operators(pop, operators,
                                         experiment_name, cur_epoch,
                                         pop_size)
Example #16
def batch_process(num_workers=8, multiprocess=False, multithread=False):
    assert (multiprocess and multithread) is False, \
        "Either multiprocess or multithread can be True, not both"

    if multiprocess:
        pool = multiprocessing.ProcessPool(num_workers)
    elif multithread:
        pool = multiprocessing.ThreadPool(num_workers)
    else:
        pool = None

    def decorator(single_sample_fn):
        def wrapper(*tensors, ndim=2, **kwargs):
            device = tensors[0].device
            tensors = [t.detach().cpu().numpy() for t in tensors]

            if tensors[0].ndim == ndim:
                out = single_sample_fn(*tensors, **kwargs)
            elif tensors[0].ndim == ndim + 1:

                def single_sample_fn_(args):
                    return single_sample_fn(*args, **kwargs)

                if pool:
                    outs = pool.map(single_sample_fn_, zip(*tensors))
                else:
                    outs = [single_sample_fn_(args) for args in zip(*tensors)]
                out = np.stack(outs)
            else:
                raise ValueError("The input tensor must have either {} "
                                 "or {} dimensions".format(ndim, ndim + 1))

            return torch.as_tensor(out, device=device)

        return wrapper

    return decorator
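# A hedged usage sketch (added; not from the source). The import alias matches
# what the decorator above assumes for `multiprocessing` (pathos); a per-sample
# NumPy routine is decorated and called on a batched torch tensor, so each
# sample is handled by a pool worker when multiprocess=True.
import numpy as np
import torch
import pathos.multiprocessing as multiprocessing


@batch_process(num_workers=4, multiprocess=True)
def normalize(x):
    # x arrives as a single (H, W) NumPy sample.
    return (x - x.mean()) / (x.std() + 1e-8)


# batch = torch.randn(8, 32, 32)      # ndim + 1 == 3 -> pooled per-sample path
# out = normalize(batch, ndim=2)      # torch tensor returned on batch.device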
Example #17
def _get_worker():
    """Return the map function for either single-core or multi-core
    calculations, based on the 'multi_cpu' flag in run_with_configuration.

    Note: Windows cannot perform multi-core calculations.

    Returns:
        pool: the pathos ProcessPool, or None for single-core runs
        worker: the builtin map or the pool's map method
    """

    if ps.config.get_setting(ps.MULTI_CPU):
        if not (os.name == 'posix'):
            raise RuntimeError('Multi processing not available in Windows')

        pool = multiprocessing.ProcessPool(NCORES)
        worker = pool.map
        msg = '---MULTI CPU CALCULATIONS STARTED with {0} cpu\'s---\n'
        ps.logger.info(msg.format(NCORES))

    else:
        worker = map
        pool = None
        ps.logger.info('---SINGLE CPU CALCULATIONS STARTED----')
    return pool, worker
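# A hedged usage sketch for _get_worker() above (`simulate` and `tasks` are
# placeholders): the caller treats the single- and multi-core paths the same
# way and only cleans up when a pool was actually created.
# pool, worker = _get_worker()
# results = list(worker(simulate, tasks))
# if pool is not None:
#     pool.close(); pool.join(); pool.clear()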
Example #18
    parser_edge = subparsers.add_parser('edge')
    parser_edge.set_defaults(paths=False)

    args = parser.parse_args()
    if args.paths:
        output_dir_tl = os.path.join(OUTPUT_DIR_ROOT,
                                     'fleischer-with-paths-format')
        fname_placeholder = '{}_' + '{}-paths_edge_disjoint-{}_dist_metric-{}.txt'.format(
            args.num_paths, args.edge_disjoint, args.dist_metric)
    else:
        output_dir_tl = os.path.join(OUTPUT_DIR_ROOT, 'fleischer-edge-format')
        fname_placeholder = '{}.txt'

    run_args = []
    for slice in range(5):
        args.slices = [slice]
        problems = get_problems(args)
        output_dir = os.path.join(output_dir_tl, 'slice-{}'.format(slice))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for prob_name, topo_fname, tm_fname in problems:
            output_fname = os.path.join(
                output_dir,
                fname_placeholder.format(os.path.basename(tm_fname)))
            run_args.append(
                (prob_name, topo_fname, tm_fname, output_fname, args))

    pool = multiprocessing.ProcessPool(14)
    pool.map(serialize, run_args)
    def predict_eps_parallel(self):
        # Exclude data for stocks that cannot be traded
        for item, df in self.reg_data.iteritems():
            self.reg_data[item] = self.reg_data[item].where(self.strategy_data.if_tradable['if_tradable'],
                                                            np.nan)
        # Following the paper's method, exclude financial stocks and stocks listed within the past two years
        self.exclude_financial_and_new()

        # DataFrame for storing the predicted EPS
        temp_df = self.reg_data.iloc[0] * np.nan
        predicted_eps_all = pd.Panel({'y1':temp_df, 'y2':temp_df, 'y3':temp_df})

        # Pull the data out of self, since data used by the parallel workers cannot live on self
        reg_data = self.reg_data * 1
        holding_days = self.holding_days

        # Define the function that computes a single rebalancing date
        def one_time_eps(cursor):
            # Take all data before the current rebalancing day and fit the model on all of it
            # The original paper uses the past 10 years of data, but since we only have about 10 years in total, all of it is used
            time = holding_days[cursor]
            # DataFrame holding one period of data
            one_time_data = pd.DataFrame(np.nan, index=reg_data.minor_axis, columns=['y1', 'y2', 'y3'])
            for horizon in [4, 8, 12]:
                item_str = 'y'+ str(int(horizon/4))
                # Inline the align_reg_data function here
                # [y, x] = self.align_reg_data(time, horizon=h)
                ###########################################################################################
                # Use the ni_ttm data to determine which fiscal-report year a row belongs to
                curr_ni = reg_data['NetIncome_ttm', :time, :]
                # How many reporting periods of data each stock has
                unique_terms = curr_ni.apply(lambda x: x.nunique(), axis=0)
                # Containers for y and x
                y = pd.Series()
                x = pd.DataFrame()

                # Loop over each stock
                for cursor, code in enumerate(unique_terms.index):
                    curr_unique_terms = unique_terms[code]
                    # Skip if the current stock does not have enough periods
                    if curr_unique_terms < horizon + 1:
                        continue
                    curr_stock_ni = curr_ni.ix[:, code]
                    # Drop duplicates and keep only the first period, i.e. use the first day of each reporting period to predict the first day of the next reporting period
                    unique_ni = curr_stock_ni.drop_duplicates(keep='first').dropna()

                    # Loop and put the data that qualifies for the current regression into y and x
                    i = 1
                    while i + horizon <= curr_unique_terms:
                        # The time point at which the prediction is made, and its corresponding data
                        predict_time = unique_ni.index[i - 1]
                        predict_data = reg_data.ix[:, predict_time, code].drop('NetIncome_ttm')
                        real_time = unique_ni.index[i + horizon - 1]
                        real_data = reg_data.ix['EPS', real_time, code]
                        x = x.append(predict_data)
                        y = y.append(pd.Series(real_data, index=[code]))
                        i += 1
                ###########################################################################################
                model, reg_results = residual_income.execute_reg(y, x)
                # Predict with the fitted model
                curr_data = reg_data[:, time, :].drop('NetIncome_ttm', axis=1).dropna(how='any')
                # Winsorize curr_data at the 1% level to remove extreme values
                lower_curr_data = curr_data.quantile(0.01, axis=0)
                upper_curr_data = curr_data.quantile(0.99, axis=0)
                curr_data = curr_data.where(curr_data>=lower_curr_data, lower_curr_data, axis=1). \
                    where(curr_data<=upper_curr_data, upper_curr_data, axis=1)

                curr_data = sm.add_constant(curr_data)
                curr_predicted_eps = curr_data.mul(reg_results.params).sum(1)
                one_time_data[item_str] = curr_predicted_eps
            print(time)
            return one_time_data

        # Run the parallel computation
        import pathos.multiprocessing as mp
        if __name__ == '__main__':
            ncpus = 20
            p = mp.ProcessPool(ncpus)
            data_size = np.arange(holding_days.shape[0])
            chunksize = int(len(data_size)/ncpus)
            results = p.map(one_time_eps, data_size, chunksize=chunksize)
            predicted_eps_y1 = pd.concat([i['y1'] for i in results], axis=1).T
            predicted_eps_y2 = pd.concat([i['y2'] for i in results], axis=1).T
            predicted_eps_y3 = pd.concat([i['y3'] for i in results], axis=1).T
            predicted_eps_y1 = predicted_eps_y1.set_index(self.holding_days).reindex(
                index=predicted_eps_all.major_axis, method='ffill')
            predicted_eps_all.ix['y1'] = predicted_eps_y1
            predicted_eps_y2 = predicted_eps_y2.set_index(self.holding_days).reindex(
                index=predicted_eps_all.major_axis, method='ffill')
            predicted_eps_all.ix['y2'] = predicted_eps_y2
            predicted_eps_y3 = predicted_eps_y3.set_index(self.holding_days).reindex(
                index=predicted_eps_all.major_axis, method='ffill')
            predicted_eps_all.ix['y3'] = predicted_eps_y3

            # Forward fill
            predicted_eps_all = predicted_eps_all.fillna(method='ffill', axis=1)

            # Save the data
            data.write_data(predicted_eps_all)
Example #20
    def create_dict(self, docs, match_suggest=False):
        """
            Use tokenizer such as deepcut to get a list of actual words being used in job posting.
            Compare the list with dictionary and collect words that don't appear in the dictionary.
            If the words are misspelled, then mark as possible typos.
            If the words are correctly spelled but are not included in the dictionary, add the word to the dictionary.
        """

        import pathos.multiprocessing as mp
        from nlp_lib import levenshtein as lv
        import os
        import time
        import copy

        def levenshtein(word, wl):
            """
            Take a word, find closest word in dict according to levenshtein distance.
            Return closest word and corresponding minimum edits. (original_word, closest_match, edits)
            """

            start = time.time()
            print('Start: ', os.getpid(), word)

            min_edit = len(word) * 2
            match = None

            for item in wl:
                edit = lv(word, item)
                if edit < min_edit:
                    min_edit = edit
                    match = item

            print('Stop: ', os.getpid(), word, ': ', str(time.time() - start))

            return (word, match, min_edit)

        def find_matching(word):

            if ord(word[0]) in range(3584, 3712):
                current_wl = self.wordlist_th
            else:
                current_wl = self.wordlist_en
            if word in current_wl:
                return (word, word, 0)
            else:
                return (word, word, 1)

        def clean_text(doc):
            return self.clean_text(doc, False)

        if not (self.wordlist_th and self.wordlist_en):
            return None

        start = time.time()

        pool = mp.ProcessPool(nodes=self.processes)
        docs = pool.amap(clean_text, docs)
        docs = docs.get()
        print('Finish cleaning text - time: ', str(time.time() - start))
        if self.token_processes == 1:
            #    tokens = [self.tkn(doc) for doc in docs]
            tokens = []
            for doc in docs:
                doc = doc.replace('|', ' ')
                token = self.TKN.tokenizer(doc)
                tokens.append(token)
        else:
            pool = mp.ProcessPool(nodes=self.processes)
            tokens = pool.amap(self.TKN.tokenizer, docs)
            tokens = tokens.get()
        print('Finish tokenization - time: ', str(time.time() - start))
        temp = copy.deepcopy(tokens)
        tokens = []
        for item in temp:
            tokens.extend(item)
        tokens = [item.lower() for item in tokens if item != '']
        tokens.sort()
        temp = copy.deepcopy(tokens)
        tokens = set(tokens)
        dicts = {}
        for token in tokens:
            dicts[token] = 0
        for token in temp:
            if token in dicts:
                dicts[token] += 1
        print('Finish compile list - time: ', str(time.time() - start))
        pool = mp.ProcessPool(nodes=self.processes)
        wordlist = pool.amap(find_matching, tokens)
        wordlist = wordlist.get()
        dict_in = {}
        dict_out = {}
        for word in wordlist:
            if word[2] == 0:
                dict_in[word[0]] = dicts[word[0]]
            else:
                dict_out[word[0]] = dicts[word[0]]
        return dict_in, dict_out, temp
Example #21
    def construct_optimized_portfolio(self, *, indus_neutral=False):
        global holding_days_g, alpha_g, factor_expo_g, factor_cov_g, spec_var_g, benchmark_g

        holding_days_g = self.holding_days
        alpha_g = self.factor_return
        factor_expo_g = self.strategy_data.factor_expo
        factor_cov_g = self.factor_cov
        spec_var_g = self.spec_var
        # benchmark_g = self.strategy_data.benchmark_price.ix['Weight_'+self.strategy_data.stock_pool]
        benchmark_g = self.strategy_data.benchmark_price.iloc[0]

        if indus_neutral:
            global indus_cons

            indus_name = factor_expo_g.items[10:38].rename(None)
            indus_cons = pd.DataFrame(indus_name, columns=['factor'])
            indus_cons['if_eq'] = True
            indus_cons['if_lower_bound'] = True
            indus_cons['limit'] = 0

        # Define the function that solves a single portfolio optimization
        def one_time_opt_func(cursor):
            curr_time = holding_days_g.iloc[cursor]
            curr_factor_ret = alpha_g.ix[curr_time, :].dropna()
            # If no stock has a factor value on the rebalancing day, return a zero series, i.e. hold cash
            # This mechanism mainly guards against errors on trading days that have no factor values
            if curr_factor_ret.isnull().all():
                return pd.Series(0.0, index=alpha_g.columns)
            curr_factor_expo = factor_expo_g.ix[:, curr_time, curr_factor_ret.index].T
            if curr_factor_expo.isnull().all().all():
                return pd.Series(0.0, index=alpha_g.columns)
            curr_factor_cov = factor_cov_g.ix[curr_time]
            curr_spec_var = spec_var_g.ix[curr_time, curr_factor_ret.index]
            curr_bench_weight = benchmark_g.ix[curr_time, curr_factor_ret.index]
            opt = optimizer_utility()

            # If every stock in an industry has zero exposure this period, drop that industry from the industry constraints
            if indus_neutral:
                empty_indus = curr_factor_expo[10:38].fillna(method='bfill').sum(1)==0
                curr_indus_cons = indus_cons[np.logical_not(empty_indus.values)]
                enable_full_inv_cons = False
            else:
                curr_indus_cons = None
                enable_full_inv_cons = True
            # curr_indus_cons = None

            # Solve for the IR-maximizing portfolio without any additional constraints
            optimized_weight = opt.solve_optimization(curr_bench_weight, curr_factor_expo,
                curr_factor_ret, curr_factor_cov, specific_var=curr_spec_var, factor_expo_cons=curr_indus_cons,
                enable_full_inv_cons=enable_full_inv_cons)

            return optimized_weight.reindex(alpha_g.columns)

        ncpus = 20
        p = mp.ProcessPool(ncpus=ncpus)
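        # pathos caches pool objects between calls, so close() followed by
        # restart() below is presumably meant to guarantee a fresh set of
        # worker processes for this run rather than a stale cached pool.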
        p.close()
        p.restart()
        data_size = np.arange(self.holding_days.shape[0])
        chunksize = int(len(data_size) / ncpus)
        results = p.map(one_time_opt_func, data_size, chunksize=chunksize)
        tar_holding = pd.DataFrame({i: v for i, v in zip(self.holding_days.index, results)}).T
        p.close()
        p.join()

        self.position.holding_matrix = tar_holding.fillna(0.0)
Example #22
    def get_short_beta_parallel(self):
        if os.path.isfile(os.path.abspath('.') + '/ResearchData/short_beta' + self.filename_appendix) \
                and not self.is_update and self.try_to_read:
            self.base_data.factor['short_beta'] = data.read_data(
                'short_beta' + self.filename_appendix)
        else:
            # Market-cap-weighted daily simple return across all stocks, weighted by the previous trading day's market value
            cap_wgt_universe_return = self.base_data.stock_price.ix[
                'daily_excess_simple_return'].mul(
                    self.base_data.stock_price.ix['FreeMarketValue'].shift(
                        1)).div(self.base_data.stock_price.
                                ix['FreeMarketValue'].shift(1).sum(1),
                                axis=0).sum(1)

            # Regression function
            def reg_func(y, *, x):
                # If y is all NaN or has too few non-NaN values, return NaN directly; the threshold can be set freely
                if y.notnull().sum() <= 5:
                    return np.nan
                x = sm.add_constant(x)
                model = sm.OLS(y, x, missing='drop')
                results = model.fit()
                return results.params[1]

            # Regress following the Barra methodology
            # Stock return data
            complete_return_data = self.complete_base_data.stock_price.ix[
                'daily_excess_simple_return']

            # Function that computes beta for each period
            def one_time_beta(cursor):
                # Note: the stock returns here need data from a past window, so the complete return data must be used
                curr_data = complete_return_data.ix[cursor - 20:cursor + 1, :]
                curr_x = cap_wgt_universe_return.ix[cursor - 20:cursor + 1]
                temp = curr_data.apply(reg_func, x=curr_x)
                print(cursor)
                return temp

            ncpus = 20
            p = mp.ProcessPool(ncpus)
            p.close()
            p.restart()
            # Normally the beta factor is computed starting from period 21
            # Note: when updating, to save time, betas for trading days 21 through 524 are not recomputed,
            # so an update starts from period 525
            if self.is_update:
                start_cursor = 524
            else:
                start_cursor = 20
            data_size = np.arange(
                start_cursor, self.base_data.stock_price.
                ix['daily_excess_simple_return'].shape[0])
            chunksize = int(len(data_size) / ncpus)
            results = p.map(one_time_beta, data_size, chunksize=chunksize)
            # Store the results
            beta = pd.concat([i for i in results], axis=1).T
            p.close()
            p.join()
            # The dates for these data are the original dates minus 20, because the first 20 periods are not computed
            # When updating, it is the original dates minus 524, for the same reason
            data_index = self.base_data.stock_price.iloc[:, start_cursor -
                                                         self.base_data.
                                                         stock_price.shape[
                                                             1]:, :].major_axis
            beta = beta.set_index(data_index)
            self.base_data.factor['short_beta'] = beta
Example #23
def main(argv=None):

    parser = argparse.ArgumentParser(
        description=
        "PINT tool for MCMC optimization of timing models using event data.")

    parser.add_argument("eventfile", help="event file to use")
    parser.add_argument("parfile", help="par file to read model from")
    parser.add_argument("gaussianfile",
                        help="gaussian file that defines template")
    parser.add_argument("--ft2", help="Path to FT2 file.", default=None)
    parser.add_argument(
        "--weightcol",
        help="name of weight column (or 'CALC' to have them computed",
        default=None,
    )
    parser.add_argument("--nwalkers",
                        help="Number of MCMC walkers (def 200)",
                        type=int,
                        default=200)
    parser.add_argument(
        "--burnin",
        help="Number of MCMC steps for burn in (def 100)",
        type=int,
        default=100,
    )
    parser.add_argument(
        "--nsteps",
        help="Number of MCMC steps to compute (def 1000)",
        type=int,
        default=1000,
    )
    parser.add_argument("--minMJD",
                        help="Earliest MJD to use (def 54680)",
                        type=float,
                        default=54680.0)
    parser.add_argument("--maxMJD",
                        help="Latest MJD to use (def 57250)",
                        type=float,
                        default=57250.0)
    parser.add_argument("--phs",
                        help="Starting phase offset [0-1] (def is to measure)",
                        type=float)
    parser.add_argument("--phserr",
                        help="Error on starting phase",
                        type=float,
                        default=0.03)
    parser.add_argument(
        "--minWeight",
        help="Minimum weight to include (def 0.05)",
        type=float,
        default=0.05,
    )
    parser.add_argument(
        "--wgtexp",
        help=
        "Raise computed weights to this power (or 0.0 to disable any rescaling of weights)",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--testWeights",
        help="Make plots to evalute weight cuts?",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--doOpt",
        help="Run initial scipy opt before MCMC?",
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--initerrfact",
        help=
        "Multiply par file errors by this factor when initializing walker starting values",
        type=float,
        default=0.1,
    )
    parser.add_argument(
        "--priorerrfact",
        help=
        "Multiple par file errors by this factor when setting gaussian prior widths",
        type=float,
        default=10.0,
    )
    parser.add_argument(
        "--usepickle",
        help="Read events from pickle file, if available?",
        default=False,
        action="store_true",
    )

    global nwalkers, nsteps, ftr

    args = parser.parse_args(argv)

    eventfile = args.eventfile
    parfile = args.parfile
    gaussianfile = args.gaussianfile
    weightcol = args.weightcol

    if args.ft2 is not None:
        # Instantiate Fermi observatory once so it gets added to the observatory registry
        get_satellite_observatory("Fermi", args.ft2)

    nwalkers = args.nwalkers
    burnin = args.burnin
    nsteps = args.nsteps
    if burnin >= nsteps:
        log.error("burnin must be < nsteps")
        sys.exit(1)
    nbins = 256  # For likelihood calculation based on gaussians file
    outprof_nbins = 256  # in the text file, for pygaussfit.py, for instance
    minMJD = args.minMJD
    maxMJD = args.maxMJD  # Usually set by coverage of IERS file

    minWeight = args.minWeight
    do_opt_first = args.doOpt
    wgtexp = args.wgtexp

    # Read in initial model
    modelin = pint.models.get_model(parfile)

    # The custom_timing version below is to manually construct the TimingModel
    # class, which allows it to be pickled. This is needed for parallelizing
    # the emcee call over a number of threads.  So far, it isn't quite working
    # so it is disabled.  The code above constructs the TimingModel class
    # dynamically, as usual.
    # modelin = custom_timing(parfile)

    # Remove the dispersion delay as it is unnecessary
    # modelin.delay_funcs['L1'].remove(modelin.dispersion_delay)
    # Set the target coords for automatic weighting if necessary
    if "ELONG" in modelin.params:
        tc = SkyCoord(
            modelin.ELONG.quantity,
            modelin.ELAT.quantity,
            frame="barycentrictrueecliptic",
        )
    else:
        tc = SkyCoord(modelin.RAJ.quantity,
                      modelin.DECJ.quantity,
                      frame="icrs")

    target = tc if weightcol == "CALC" else None

    # TODO: make this properly handle long double
    if not args.usepickle or (not (os.path.isfile(eventfile + ".pickle") or
                                   os.path.isfile(eventfile + ".pickle.gz"))):
        # Read event file and return list of TOA objects
        tl = fermi.load_Fermi_TOAs(eventfile,
                                   weightcolumn=weightcol,
                                   targetcoord=target,
                                   minweight=minWeight)
        # Limit the TOAs to ones in selected MJD range and above minWeight
        tl = [
            tl[ii] for ii in range(len(tl))
            if (tl[ii].mjd.value > minMJD and tl[ii].mjd.value < maxMJD and (
                weightcol is None or tl[ii].flags["weight"] > minWeight))
        ]
        log.info("There are %d events we will use" % len(tl))
        # Now convert to TOAs object and compute TDBs and posvels
        ts = toa.TOAs(toalist=tl)
        ts.filename = eventfile
        ts.compute_TDBs()
        ts.compute_posvels(ephem="DE421", planets=False)
        ts.pickle()
    else:  # read the events in as a pickle file
        picklefile = toa._check_pickle(eventfile)
        if not picklefile:
            picklefile = eventfile
        ts = toa.TOAs(picklefile)

    if weightcol is not None:
        if weightcol == "CALC":
            weights = np.asarray([x["weight"] for x in ts.table["flags"]])
            log.info("Original weights have min / max weights %.3f / %.3f" %
                     (weights.min(), weights.max()))
            # Rescale the weights, if requested (by having wgtexp != 0.0)
            if wgtexp != 0.0:
                weights **= wgtexp
                wmx, wmn = weights.max(), weights.min()
                # make the highest weight = 1, but keep min weight the same
                weights = wmn + ((weights - wmn) * (1.0 - wmn) / (wmx - wmn))
            for ii, x in enumerate(ts.table["flags"]):
                x["weight"] = weights[ii]
        weights = np.asarray([x["weight"] for x in ts.table["flags"]])
        log.info("There are %d events, with min / max weights %.3f / %.3f" %
                 (len(weights), weights.min(), weights.max()))
    else:
        weights = None
        log.info("There are %d events, no weights are being used." % ts.ntoas)

    # Now load in the gaussian template and normalize it
    gtemplate = read_gaussfitfile(gaussianfile, nbins)
    gtemplate /= gtemplate.mean()

    # Set the priors on the parameters in the model, before
    # instantiating the emcee_fitter
    # Currently, this adds a gaussian prior on each parameter
    # with width equal to the par file uncertainty * priorerrfact,
    # and then puts in some special cases.
    # *** This should be replaced/supplemented with a way to specify
    # more general priors on parameters that need certain bounds
    phs = 0.0 if args.phs is None else args.phs
    fitkeys, fitvals, fiterrs = get_fit_keyvals(modelin,
                                                phs=phs,
                                                phserr=args.phserr)

    for key, v, e in zip(fitkeys[:-1], fitvals[:-1], fiterrs[:-1]):
        if key == "SINI" or key == "E" or key == "ECC":
            getattr(modelin, key).prior = Prior(uniform(0.0, 1.0))
        elif key == "PX":
            getattr(modelin, key).prior = Prior(uniform(0.0, 10.0))
        elif key.startswith("GLPH"):
            getattr(modelin, key).prior = Prior(uniform(-0.5, 1.0))
        else:
            getattr(modelin, key).prior = Prior(
                norm(loc=float(v), scale=float(e * args.priorerrfact)))

    # Now define the requirements for emcee
    ftr = emcee_fitter(ts, modelin, gtemplate, weights, phs, args.phserr)

    # Use this if you want to see the effect of setting minWeight
    if args.testWeights:
        log.info("Checking H-test vs weights")
        ftr.prof_vs_weights(use_weights=True)
        ftr.prof_vs_weights(use_weights=False)
        sys.exit()

    # Now compute the photon phases and see if we see a pulse
    phss = ftr.get_event_phases()
    maxbin, like_start = marginalize_over_phase(phss,
                                                gtemplate,
                                                weights=ftr.weights,
                                                minimize=True,
                                                showplot=False)
    log.info("Starting pulse likelihood: %f" % like_start)
    if args.phs is None:
        fitvals[-1] = 1.0 - maxbin[0] / float(len(gtemplate))
        if fitvals[-1] > 1.0:
            fitvals[-1] -= 1.0
        if fitvals[-1] < 0.0:
            fitvals[-1] += 1.0
        log.info("Starting pulse phase: %f" % fitvals[-1])
    else:
        log.warning("Measured starting pulse phase is %f, but using %f" %
                    (1.0 - maxbin / float(len(gtemplate)), args.phs))
        fitvals[-1] = args.phs
    ftr.fitvals[-1] = fitvals[-1]
    ftr.phaseogram(plotfile=ftr.model.PSR.value + "_pre.png")
    plt.close()
    # ftr.phaseogram()

    # Write out the starting pulse profile
    vs, xs = np.histogram(ftr.get_event_phases(),
                          outprof_nbins,
                          range=[0, 1],
                          weights=ftr.weights)
    f = open(ftr.model.PSR.value + "_prof_pre.txt", "w")
    for x, v in zip(xs, vs):
        f.write("%.5f  %12.5f\n" % (x, v))
    f.close()

    # Try normal optimization first to see how it goes
    if do_opt_first:
        result = op.minimize(ftr.minimize_func, np.zeros_like(ftr.fitvals))
        newfitvals = np.asarray(result["x"]) * ftr.fiterrs + ftr.fitvals
        like_optmin = -result["fun"]
        log.info("Optimization likelihood: %f" % like_optmin)
        ftr.set_params(dict(zip(ftr.fitkeys, newfitvals)))
        ftr.phaseogram()
    else:
        like_optmin = -np.inf

    # Set up the initial conditions for the emcee walkers.  Use the
    # scipy.optimize newfitvals instead if they are better
    ndim = ftr.n_fit_params
    if like_start > like_optmin:
        # Keep the starting deviations small...
        pos = [
            ftr.fitvals +
            ftr.fiterrs * args.initerrfact * np.random.randn(ndim)
            for ii in range(nwalkers)
        ]
        # Set starting params
        for param in [
                "GLPH_1", "GLEP_1", "SINI", "M2", "E", "ECC", "PX", "A1"
        ]:
            if param in ftr.fitkeys:
                idx = ftr.fitkeys.index(param)
                if param == "GLPH_1":
                    svals = np.random.uniform(-0.5, 0.5, nwalkers)
                elif param == "GLEP_1":
                    svals = np.random.uniform(minMJD + 100, maxMJD - 100,
                                              nwalkers)
                    # svals = 55422.0 + np.random.randn(nwalkers)
                elif param == "SINI":
                    svals = np.random.uniform(0.0, 1.0, nwalkers)
                elif param == "M2":
                    svals = np.random.uniform(0.1, 0.6, nwalkers)
                elif param in ["E", "ECC", "PX", "A1"]:
                    # Ensure all positive
                    svals = np.fabs(ftr.fitvals[idx] + ftr.fiterrs[idx] *
                                    np.random.randn(nwalkers))
                    if param in ["E", "ECC"]:
                        svals[svals > 1.0] = 1.0 - (svals[svals > 1.0] - 1.0)
                for ii in range(nwalkers):
                    pos[ii][idx] = svals[ii]
    else:
        pos = [
            newfitvals + ftr.fiterrs * args.initerrfact * np.random.randn(ndim)
            for i in range(nwalkers)
        ]
    # Set the 0th walker to have the initial pre-fit solution
    # This way, one walker should always be in a good position
    pos[0] = ftr.fitvals

    import emcee

    # Following are for parallel processing tests...
    if 0:

        def unwrapped_lnpost(theta, ftr=ftr):
            return ftr.lnposterior(theta)

        import pathos.multiprocessing as mp

        pool = mp.ProcessPool(nodes=8)
        sampler = emcee.EnsembleSampler(nwalkers,
                                        ndim,
                                        unwrapped_lnpost,
                                        pool=pool,
                                        args=[ftr])
    else:
        sampler = emcee.EnsembleSampler(nwalkers, ndim, ftr.lnposterior)
    # The number is the number of points in the chain
    sampler.run_mcmc(pos, nsteps)

    def chains_to_dict(names, sampler):
        chains = [sampler.chain[:, :, ii].T for ii in range(len(names))]
        return dict(zip(names, chains))

    def plot_chains(chain_dict, file=False):
        npts = len(chain_dict)
        fig, axes = plt.subplots(npts, 1, sharex=True, figsize=(8, 9))
        for ii, name in enumerate(chain_dict.keys()):
            axes[ii].plot(chain_dict[name], color="k", alpha=0.3)
            axes[ii].set_ylabel(name)
        axes[npts - 1].set_xlabel("Step Number")
        fig.tight_layout()
        if file:
            fig.savefig(file)
            plt.close()
        else:
            plt.show()
            plt.close()

    chains = chains_to_dict(ftr.fitkeys, sampler)
    plot_chains(chains, file=ftr.model.PSR.value + "_chains.png")

    # Make the triangle plot.
    samples = sampler.chain[:, burnin:, :].reshape((-1, ndim))
    try:
        import corner

        fig = corner.corner(
            samples,
            labels=ftr.fitkeys,
            bins=50,
            truths=ftr.maxpost_fitvals,
            plot_contours=True,
        )
        fig.savefig(ftr.model.PSR.value + "_triangle.png")
        plt.close()
    except ImportError:
        pass

    # Make a phaseogram with the 50th percentile values
    # ftr.set_params(dict(zip(ftr.fitkeys, np.percentile(samples, 50, axis=0))))
    # Make a phaseogram with the best MCMC result
    ftr.set_params(dict(zip(ftr.fitkeys[:-1], ftr.maxpost_fitvals[:-1])))
    ftr.phaseogram(plotfile=ftr.model.PSR.value + "_post.png")
    plt.close()

    # Write out the output pulse profile
    vs, xs = np.histogram(ftr.get_event_phases(),
                          outprof_nbins,
                          range=[0, 1],
                          weights=ftr.weights)
    f = open(ftr.model.PSR.value + "_prof_post.txt", "w")
    for x, v in zip(xs, vs):
        f.write("%.5f  %12.5f\n" % (x, v))
    f.close()

    # Write out the par file for the best MCMC parameter est
    f = open(ftr.model.PSR.value + "_post.par", "w")
    f.write(ftr.model.as_parfile())
    f.close()

    # Print the best MCMC values and ranges
    ranges = [
        (v[1], v[2] - v[1], v[1] - v[0])
        for v in zip(*np.percentile(samples, [16, 50, 84], axis=0))
    ]
    log.info("Post-MCMC values (50th percentile +/- (16th/84th percentile):")
    for name, vals in zip(ftr.fitkeys, ranges):
        log.info("%8s:" % name + "%25.15g (+ %12.5g  / - %12.5g)" % vals)

    # Put the same stuff in a file
    f = open(ftr.model.PSR.value + "_results.txt", "w")

    f.write("Post-MCMC values (50th percentile +/- (16th/84th percentile):\n")
    for name, vals in zip(ftr.fitkeys, ranges):
        f.write("%8s:" % name + " %25.15g (+ %12.5g  / - %12.5g)\n" % vals)

    f.write("\nMaximum likelihood par file:\n")
    f.write(ftr.model.as_parfile())
    f.close()

    from six.moves import cPickle as pickle

    pickle.dump(samples, open(ftr.model.PSR.value + "_samples.pickle", "wb"))
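# A minimal standalone sketch of the disabled parallel path above (toy
# log-probability, not the PINT fitter; an added assumption, not project code):
# emcee only requires a pool object with a map() method, and a pathos
# ProcessPool qualifies because it serializes closures with dill.
# import numpy as np
# import emcee
# import pathos.multiprocessing as mp
#
# def log_prob(theta):
#     return -0.5 * np.sum(theta ** 2)      # standard-normal toy posterior
#
# nwalkers, ndim = 16, 3
# p0 = np.random.randn(nwalkers, ndim)
# with mp.ProcessPool(nodes=4) as pool:
#     sampler = emcee.EnsembleSampler(nwalkers, ndim, log_prob, pool=pool)
#     sampler.run_mcmc(p0, 200)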
def secondlevel(inputs, args, secondlevel=False):
    outputs = list()
    if secondlevel:
        input_images = [row[-1] for row in inputs]
    else:
        input_images = [val for sublist in inputs for val in sublist]
    if not is_non_zero_file("output/secondlevel/COMPLETE"):
        # Base command
        command = f"{args.modelbuild_command} -d 3 "
        # Setup directory and naming
        command += "-o output/secondlevel/secondlevel_ "
        # Defaults to bootstrap modelbuilds with rigid prealignment, no rigid
        # update
        command += "-r 1 -l 1 -y 0 "
        # Model build setup
        command += "-c {} -a {} -e {} -g {} -i {} -n {} -m {} -t {} -u {} -v {} ".format(
            args.cluster_type,
            args.average_type,
            args.float,
            args.gradient_step,
            args.model_iterations,
            int(args.N4) if not secondlevel else 0,
            args.metric,
            args.transform,
            args.walltime,
            args.memory_request,
        )
        # Registrations Setup
        command += "-q {} -f {} -s {} ".format(args.reg_iterations,
                                               args.reg_shrinks,
                                               args.reg_smoothing)
        if args.rigid_model_target:
            command += "-z {} ".format(args.rigid_model_target)
        command += " ".join(input_images)
        command += " && echo DONE > output/secondlevel/COMPLETE"
        results = run_command(command, args.dry_run)
        # Here we should add the ability to limit the number of commands submitted
        if not args.dry_run:
            with open("output/secondlevel/secondlevel.log", "wb") as logfile:
                logfile.write(results.stdout)

    pool = multiprocessing.ProcessPool(nodes=args.local_threads)

    mkdirp("output/jacobians/overall")
    mkdirp("output/compositewarps/secondlevel")
    # Create mask for delin
    run_command(
        "ThresholdImage 3 output/secondlevel/secondlevel_template0.nii.gz output/secondlevel/secondlevel_otsumask.nii.gz Otsu 1",
        args.dry_run,
    )
    # Loop over input file warp fields to produce delin
    jacobians = list()
    print("Processing Second-Level DBM outputs")
    for i, subject in enumerate(tqdm.tqdm(input_images), start=0):
        subjectname = pathlib.Path(subject).name.rsplit(".nii")[0]
        if not is_non_zero_file("output/compositewarps/secondlevel/COMPLETE"):
            commands = list()
            # print(f"Processing subject {subject} DBM outputs")
            # Compute delin
            run_command(
                f"ANTSUseDeformationFieldToGetAffineTransform output/secondlevel/secondlevel_{subjectname}{i}1InverseWarp.nii.gz 0.25 "
                f"affine output/compositewarps/secondlevel/{subjectname}_delin.mat output/secondlevel/secondlevel_otsumask.nii.gz",
                args.dry_run,
            )

            # Create composite field of delin
            commands.append(
                f"antsApplyTransforms -d 3 -t [output/compositewarps/secondlevel/{subjectname}_delin.mat,1] "
                f"-r output/secondlevel/secondlevel_template0.nii.gz --verbose -o [output/compositewarps/secondlevel/{subjectname}_delin.nii.gz,1]"
            )

            # Create composite field of affine
            commands.append(
                f"antsApplyTransforms -d 3 -t [output/secondlevel/secondlevel_{subjectname}{i}0GenericAffine.mat,1] "
                f"-r output/secondlevel/secondlevel_template0.nii.gz --verbose -o [output/compositewarps/secondlevel/{subjectname}_affine.nii.gz,1]"
            )

            # Register final model to common space
            if args.resample_to_common_space:
                commands.append(
                    f"antsRegistrationSyN.sh -d 3 -f {args.resample_to_common_space} -m output/secondlevel/secondlevel_template0.nii.gz -o output/secondlevel/template0_common_space_"
                )

            pool.map(lambda x: run_command(x, args.dry_run), commands)
            commands = list()

            # Generate jacobians of composite affine fields and nonlinear fields
            commands.append(
                f"CreateJacobianDeterminantImage 3 output/secondlevel/secondlevel_{subjectname}{i}1Warp.nii.gz output/jacobians/overall/secondlevel_{subjectname}_nlin.nii.gz 1 1"
            )
            commands.append(
                f"CreateJacobianDeterminantImage 3 output/compositewarps/secondlevel/{subjectname}_delin.nii.gz output/jacobians/overall/secondlevel_{subjectname}_delin.nii.gz 1 1"
            )
            commands.append(
                f"CreateJacobianDeterminantImage 3 output/compositewarps/secondlevel/{subjectname}_affine.nii.gz output/jacobians/overall/secondlevel_{subjectname}_affine.nii.gz 1 1"
            )

            pool.map(lambda x: run_command(x, args.dry_run), commands)
            commands = list()

            commands.append(
                f"ImageMath 3 output/jacobians/overall/secondlevel_{subjectname}_relative.nii.gz + output/jacobians/overall/secondlevel_{subjectname}_nlin.nii.gz output/jacobians/overall/secondlevel_{subjectname}_delin.nii.gz"
            )
            commands.append(
                f"ImageMath 3 output/jacobians/overall/secondlevel_{subjectname}_absolute.nii.gz + output/jacobians/overall/secondlevel_{subjectname}_nlin.nii.gz output/jacobians/overall/secondlevel_{subjectname}_affine.nii.gz"
            )
            pool.uimap(lambda x: run_command(x, args.dry_run), commands)
            commands = list()

        jacobians.append(
            f"output/jacobians/overall/secondlevel_{subjectname}_relative.nii.gz"
        )
        jacobians.append(
            f"output/jacobians/overall/secondlevel_{subjectname}_absolute.nii.gz"
        )
        jacobians.append(
            f"output/jacobians/overall/secondlevel_{subjectname}_nlin.nii.gz")

    run_command("echo DONE > output/compositewarps/secondlevel/COMPLETE",
                args.dry_run)

    if args.resample_to_common_space:
        mkdirp("output/jacobians/common_space")

    for i, subject in enumerate(tqdm.tqdm(input_images), start=0):
        subjectname = pathlib.Path(subject).name.rsplit(".nii")[0]
        if not secondlevel and args.resample_to_common_space:
            commands.append(
                f"antsApplyTransforms -d 3 -i output/jacobians/overall/secondlevel_{subjectname}_relative.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat "
                f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/secondlevel_{subjectname}_relative.nii.gz"
            )
            commands.append(
                f"antsApplyTransforms -d 3 -i output/jacobians/overall/secondlevel_{subjectname}_absolute.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat "
                f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/secondlevel_{subjectname}_absolute.nii.gz"
            )
            commands.append(
                f"antsApplyTransforms -d 3 -i output/jacobians/overall/secondlevel_{subjectname}_nlin.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat "
                f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/secondlevel_{subjectname}_nlin.nii.gz"
            )
            pool.uimap(lambda x: run_command(x, args.dry_run), commands)
            commands = list()

            jacobians.append(
                f"output/jacobians/common_space/secondlevel_{subjectname}_relative.nii.gz"
            )
            jacobians.append(
                f"output/jacobians/common_space/secondlevel_{subjectname}_absolute.nii.gz"
            )
            jacobians.append(
                f"output/jacobians/common_space/secondlevel_{subjectname}_nlin.nii.gz"
            )

    if secondlevel:
        mkdirp("output/compositewarps/groupwise")
        mkdirp("output/jacobians/resampled")
        mkdirp("output/jacobians/groupwise")
        print("Processing First-Level DBM Outputs")
        for subjectnum, row in enumerate(tqdm.tqdm(
            [line[:-1] for line in inputs]),
                                         start=0):
            # Make a mask per subject
            run_command(
                f"ThresholdImage 3 output/subject{subjectnum}/subject{subjectnum}_template0.nii.gz output/subject{subjectnum}/subject{subjectnum}_otsumask.nii.gz Otsu 1",
                args.dry_run,
            )
            for scannum, scan in enumerate(row, start=0):
                commands = list()
                scanname = pathlib.Path(scan).name.rsplit(".nii")[0]
                # print(f"Processing scan {scanname}")
                # Estimate affine residual from nonlinear and create composite warp and jacobian field
                run_command(
                    f"ANTSUseDeformationFieldToGetAffineTransform output/subject{subjectnum}/subject{subjectnum}_{scanname}{scannum}1InverseWarp.nii.gz 0.25 "
                    f"affine output/compositewarps/groupwise/subject{subjectnum}_{scanname}_delin.mat output/subject{subjectnum}/subject{subjectnum}_otsumask.nii.gz",
                    args.dry_run,
                )
                commands.append(
                    f"antsApplyTransforms -d 3 -t [output/compositewarps/groupwise/subject{subjectnum}_{scanname}_delin.mat,1] -r output/subject{subjectnum}/subject{subjectnum}_template0.nii.gz "
                    f"--verbose -o [output/compositewarps/groupwise/subject{subjectnum}_{scanname}_delin.nii.gz,1]"
                )
                # Create composite warp field from affine
                commands.append(
                    f"antsApplyTransforms -d 3 -t [output/subject{subjectnum}/subject{subjectnum}_{scanname}{scannum}0GenericAffine.mat,1] -r output/subject{subjectnum}/subject{subjectnum}_template0.nii.gz "
                    f"--verbose -o [output/compositewarps/groupwise/subject{subjectnum}_{scanname}_affine.nii.gz,1]"
                )

                pool.map(lambda x: run_command(x, args.dry_run), commands)
                commands = list()

                # Create jacobian images from nlin and composite warp fields
                commands.append(
                    f"CreateJacobianDeterminantImage 3 output/subject{subjectnum}/subject{subjectnum}_{scanname}{scannum}1Warp.nii.gz output/jacobians/groupwise/subject{subjectnum}_{scanname}_nlin.nii.gz 1 1"
                )
                commands.append(
                    f"CreateJacobianDeterminantImage 3 output/compositewarps/groupwise/subject{subjectnum}_{scanname}_delin.nii.gz output/jacobians/groupwise/subject{subjectnum}_{scanname}_delin.nii.gz 1 1"
                )
                commands.append(
                    f"CreateJacobianDeterminantImage 3 output/compositewarps/groupwise/subject{subjectnum}_{scanname}_affine.nii.gz output/jacobians/groupwise/subject{subjectnum}_{scanname}_affine.nii.gz 1 1"
                )

                pool.map(lambda x: run_command(x, args.dry_run), commands)
                commands = list()

                # Create relative and absolute jacobians by adding affine/delin jacobians
                commands.append(
                    f"ImageMath 3 output/jacobians/groupwise/subject{subjectnum}_{scanname}_relative.nii.gz + output/jacobians/groupwise/subject{subjectnum}_{scanname}_nlin.nii.gz output/jacobians/groupwise/subject{subjectnum}_{scanname}_delin.nii.gz"
                )
                commands.append(
                    f"ImageMath 3 output/jacobians/groupwise/subject{subjectnum}_{scanname}_absolute.nii.gz + output/jacobians/groupwise/subject{subjectnum}_{scanname}_nlin.nii.gz output/jacobians/groupwise/subject{subjectnum}_{scanname}_affine.nii.gz"
                )

                pool.map(lambda x: run_command(x, args.dry_run), commands)
                commands = list()

                # Resample jacobian to common space
                commands.append(
                    f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_relative.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                    f"-r output/secondlevel/secondlevel_template0.nii.gz --verbose -o output/jacobians/resampled/subject{subjectnum}_{scanname}_relative.nii.gz"
                )
                commands.append(
                    f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_absolute.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                    f"-r output/secondlevel/secondlevel_template0.nii.gz --verbose -o output/jacobians/resampled/subject{subjectnum}_{scanname}_absolute.nii.gz"
                )
                commands.append(
                    f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_nlin.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                    f"-r output/secondlevel/secondlevel_template0.nii.gz --verbose -o output/jacobians/resampled/subject{subjectnum}_{scanname}_nlin.nii.gz"
                )

                if args.resample_to_common_space:
                    commands.append(
                        f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_relative.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                        f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/subject{subjectnum}_{scanname}_relative.nii.gz"
                    )
                    commands.append(
                        f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_absolute.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                        f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/subject{subjectnum}_{scanname}_absolute.nii.gz"
                    )
                    commands.append(
                        f"antsApplyTransforms -d 3 -i output/jacobians/groupwise/subject{subjectnum}_{scanname}_nlin.nii.gz -t output/secondlevel/template0_common_space_1Warp.nii.gz -t output/secondlevel/template0_common_space_0GenericAffine.mat -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}1Warp.nii.gz -t output/secondlevel/secondlevel_subject{subjectnum}_template0{subjectnum}0GenericAffine.mat "
                        f"-r {args.resample_to_common_space} --verbose -o output/jacobians/common_space/subject{subjectnum}_{scanname}_nlin.nii.gz"
                    )

                pool.uimap(lambda x: run_command(x, args.dry_run), commands)
                commands = list()

                # Append jacobians to list
                jacobians.append(
                    f"output/jacobians/resampled/subject{subjectnum}_{scanname}_relative.nii.gz"
                )
                jacobians.append(
                    f"output/jacobians/resampled/subject{subjectnum}_{scanname}_absolute.nii.gz"
                )
                jacobians.append(
                    f"output/jacobians/resampled/subject{subjectnum}_{scanname}_nlin.nii.gz"
                )
                if args.resample_to_common_space:
                    jacobians.append(
                        f"output/jacobians/common_space/subject{subjectnum}_{scanname}_relative.nii.gz"
                    )
                    jacobians.append(
                        f"output/jacobians/common_space/subject{subjectnum}_{scanname}_absolute.nii.gz"
                    )
                    jacobians.append(
                        f"output/jacobians/common_space/subject{subjectnum}_{scanname}_nlin.nii.gz"
                    )

    commands = list()
    print("Blurring Jacobians")
    for jacobian in jacobians:
        for blur in args.jacobian_sigmas:
            commands.append(
                f"SmoothImage 3 {jacobian} {blur} {jacobian.rsplit('.nii')[0]}_smooth{blur}.nii.gz 1 0"
            )
    for _ in tqdm.tqdm(
            pool.uimap(lambda x: run_command(x, args.dry_run), commands),
            total=len(commands),
    ):
        pass

    pool.close()
    pool.clear()
    print(problem.name)
    problem.print_stats()

    try:
        problem.traffic_matrix.serialize(TM_DIR)
    except Exception:
        print('{} failed'.format(problem.name))
        import traceback
        traceback.print_exc()


if __name__ == '__main__':
    PROBLEM_SHORT_NAMES = [
        'gtsce',
        'delta',
        'us-carrier',
        'tata',
        'cogentco',
        'dial',
        'colt',
        'interoute',
        'ion',
        'uninett',
        'kdl',
    ]
    if not os.path.exists(TM_DIR):
        os.makedirs(TM_DIR)

    pool = multiprocessing.ProcessPool(len(PROBLEM_SHORT_NAMES))
    pool.map(generate_traffic_matrix, PROBLEM_SHORT_NAMES)
Beispiel #26
    def construct_optimized_portfolio(self, *, indus_neutral=False):
        global holding_days_g, alpha_g, factor_expo_g, factor_cov_g, spec_var_g, benchmark_g, \
               cov_risk_aversion_g, spec_risk_aversion_g

        holding_days_g = self.holding_days
        alpha_g = self.factor_return
        factor_expo_g = self.strategy_data.factor_expo
        factor_cov_g = self.factor_cov
        spec_var_g = self.spec_var
        benchmark_g = self.strategy_data.benchmark_price.iloc[0]

        cov_risk_aversion_g = self.cov_risk_aversion
        spec_risk_aversion_g = self.spec_risk_aversion

        if indus_neutral:
            global indus_cons

            indus_name = factor_expo_g.items[10:38]
            indus_cons = pd.DataFrame(indus_name.values, columns=['factor'])
            indus_cons['if_eq'] = True
            indus_cons['if_lower_bound'] = True
            indus_cons['limit'] = 0.0

        # Define the function that solves the optimization for a single rebalancing date
        def one_time_opt_func(cursor):
            curr_time = holding_days_g.iloc[cursor]
            curr_factor_ret = alpha_g.ix[curr_time, :].dropna()
            # If every stock lacks a factor value on the rebalancing day, return a
            # zero series, i.e. hold cash; this guard mainly prevents errors on
            # trading days without factor values
            if curr_factor_ret.isnull().all():
                return pd.Series(0.0, index=alpha_g.columns)
            curr_factor_expo = factor_expo_g.ix[:, curr_time,
                                                curr_factor_ret.index].T
            if curr_factor_expo.isnull().all().all():
                return pd.Series(0.0, index=alpha_g.columns)
            curr_factor_cov = factor_cov_g.ix[curr_time]
            curr_spec_var = spec_var_g.ix[curr_time, curr_factor_ret.index]
            curr_bench_weight = benchmark_g.ix[curr_time,
                                               curr_factor_ret.index]
            opt = optimizer_utility()
            opt.set_opt_package('cvxopt')
            opt.set_risk_aversion(cov_risk_aversion=cov_risk_aversion_g,
                                  spec_risk_aversion=spec_risk_aversion_g)

            # If every stock in an industry has zero exposure this period, drop that industry from the industry constraints
            if indus_neutral:
                empty_indus = curr_factor_expo[10:38].fillna(
                    method='bfill').sum(1) == 0
                curr_indus_cons = indus_cons[np.logical_not(
                    empty_indus.values)]
                enable_full_inv_cons = False
            else:
                curr_indus_cons = None
                enable_full_inv_cons = True
            # curr_indus_cons = None

            # Solve for the IR-maximizing portfolio without any additional constraints
            opt.solve_optimization(curr_bench_weight,
                                   curr_factor_expo,
                                   curr_factor_cov,
                                   residual_return=curr_factor_ret,
                                   specific_var=curr_spec_var,
                                   factor_expo_cons=curr_indus_cons,
                                   enable_full_inv_cons=enable_full_inv_cons,
                                   asset_cap=None)

            return (opt.optimized_weight.reindex(alpha_g.columns),
                    opt.forecasted_vol)

        ncpus = 20
        p = mp.ProcessPool(ncpus=ncpus)
        p.close()
        p.restart()
        data_size = np.arange(self.holding_days.shape[0])
        chunksize = int(len(data_size) / ncpus)
        results = p.map(one_time_opt_func, data_size, chunksize=chunksize)
        tar_holding = pd.DataFrame(
            {i: v[0]
             for i, v in zip(self.holding_days.index, results)}).T
        forecasted_vol = pd.Series(
            {i: v[1]
             for i, v in zip(self.holding_days.index, results)})
        p.close()
        p.join()

        self.position.holding_matrix = tar_holding.fillna(0.0)
        # No longer hold the suspended portion as cash; instead hold stocks fully and normalize the stock weights
        self.position.holding_matrix = self.position.holding_matrix.apply(
            position.to_percentage_func, axis=1)
        # # The remaining portion is cash
        # self.position.cash = 1 - self.position.holding_matrix.sum(1)
        self.forecasted_vol = forecasted_vol
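

# A minimal, self-contained sketch of the pattern used in the method above:
# read-only inputs are published as module-level globals before the ProcessPool
# is created, so forked workers can read them without shipping large objects
# with every task. All names here (shared_prices_g, mean_log_return) are
# hypothetical and only illustrate the idiom.
import numpy as np
import pathos.multiprocessing as mp

shared_prices_g = None  # set by the parent process before the pool is created


def mean_log_return(cursor):
    # workers read the shared global; only the integer cursor is sent per task
    window = shared_prices_g[max(cursor - 5, 0):cursor + 1]
    return float(np.diff(np.log(window)).mean())


if __name__ == '__main__':
    shared_prices_g = np.linspace(100.0, 110.0, 50)
    pool = mp.ProcessPool(ncpus=4)
    results = pool.map(mean_log_return, range(1, 50))
    pool.close()
    pool.join()
    pool.clear()
    print(len(results))

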
def single_closure(G_old,
                   G_new,
                   Ops,
                   MaxNew=-1,
                   Progress=True,
                   Search=None):  # {{{
    # G_old is a set of elements. G_new has been computed by taking G_old and
    # applying single functions from Ops to it. It should be disjoint from G_old.
    # We return G_newer, which is the result of applying functions from Ops to
    # inputs from G_old and G_new, with at least 1 element of G_new as input. It
    # will be disjoint from G_old+G_new.

    with MP.ProcessPool() as pool:
        G_newer = FancySet()
        op_total = len(Ops)
        for op_count, op in enumerate(Ops):
            args_total = (len(G_old) +
                          len(G_new))**op.arity - len(G_old)**op.arity
            args_count = 0
            # We don't want to evaluate op(all G_old), so we do all combinations of
            # elements from G_old and G_new, with at least 1 G_new always appearing.
            # We do this by quantifying over all variable positions where a new
            # element will appear.
            all_old = [0] * op.arity
            for vars_old_new in powerset_as_indicators(op.arity):
                # skip op(all G_old) *unless* we have been doing a MaxNew style
                # generation, in which case G_old need not be closed under op.
                if vars_old_new == all_old and MaxNew < 0:
                    continue
                args_all = product(*[[G_old, G_new][var]
                                     for var in vars_old_new])
                # we have to seek through the argument iterator to find the arguments that produced this result... not an ideal solution
                args_all_seek = product(*[[G_old, G_new][var]
                                          for var in vars_old_new])
                prev_args_index = -1
                for args_index, result in enumerate(pool.imap(op, args_all)):
                    # if result is something new
                    if not (result in G_old or result in G_new
                            or result in G_newer):
                        args = next(
                            islice(args_all_seek,
                                   args_index - prev_args_index - 1, None))
                        prev_args_index = args_index
                        G_newer.add(result, op.pprint(*args))
                        if Search is not None and Search(result):
                            stdout.write("\nFound:\n" + op.pprint(*args) +
                                         "\n")
                    args_count += 1
                    if Progress and args_count % 10000 == 0:
                        stdout.write( "\roperation " + op.name \
                            + " " + str(op_count+1) + " / " + str(op_total) \
                            + ", argument " + str(args_count) + " / " + str(args_total) \
                            + " ~ " + str( round( args_count/args_total*100, 4 ) ) + "%" \
                            + ", new elements: " + str(len(G_newer)) + " "*2 )
                        stdout.flush()
                    if 0 < MaxNew <= len(
                            G_newer
                    ):  # return if found the max number of new elements
                        return G_newer
        if Progress:
            stdout.write("  done.\n")
        return G_newer
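

# FancySet and powerset_as_indicators() are helpers defined elsewhere in the
# original module. The sketch below is one plausible reading of
# powerset_as_indicators, inferred from its use above: it yields every old/new
# indicator list of the given length, so each argument position can draw from
# either G_old (0) or G_new (1). This is an assumption, not the original helper.
from itertools import product as _indicator_product


def powerset_as_indicators_sketch(arity):
    # all 2**arity old/new choices per argument slot; emitted as lists so the
    # `vars_old_new == all_old` comparison above behaves as intended,
    # e.g. arity 2 yields [0, 0], [0, 1], [1, 0], [1, 1]
    return (list(bits) for bits in _indicator_product((0, 1), repeat=arity))
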
Beispiel #28
def parallelize(fn, list_args, num_pools=NUM_POOLS):
    pool = multiprocessing.ProcessPool(num_pools)
    results = pool.amap(fn, list_args)
    while not results.ready():
        time.sleep(5)
    pool.close()
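

# A minimal usage sketch for the helper above; `square` and the argument list
# are hypothetical. Note that amap() returns an asynchronous result object, so
# a caller that needs the outputs would typically call results.get() rather
# than only waiting for readiness as parallelize() does.
def square(x):
    return x * x


if __name__ == '__main__':
    parallelize(square, [1, 2, 3, 4], num_pools=2)
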
Beispiel #29
    def exec(self,
             samFile,
             oFunc,
             sEnvironment,
             chunkSize=1000,
             pReduceFunc=None):

        self.pool = mp.ProcessPool(self.nprocs)
        allResults = []

        resultObj = None

        samHeader = samFile.header

        hasEnded = False

        while len(allResults) < self.nprocs:

            samChunk = []
            for aln in samFile:

                samChunk.append(aln.to_string())

                if len(samChunk) >= chunkSize:
                    break

            if len(samChunk) != chunkSize:
                hasEnded = True

            allResults.append(
                self.pool.apipe(oFunc, samHeader, samChunk, sEnvironment))

        if hasEnded:
            self.pool.close()

        while len(allResults) > 0:

            i = 0
            while i < len(allResults):

                if allResults[i].ready():

                    result = allResults[i].get()

                    if pReduceFunc is not None:

                        resultObj = pReduceFunc(resultObj, result,
                                                sEnvironment)

                    else:

                        if resultObj is None:
                            resultObj = []

                        resultObj.append(result)

                    del allResults[i]
                    """
                    FILL UP THE QUEUE
                    """

                    if not hasEnded:
                        while len(allResults) < self.nprocs:

                            samChunk = []
                            for aln in samFile:

                                samChunk.append(aln.to_string())

                                if len(samChunk) >= chunkSize:
                                    break

                            if len(samChunk) != chunkSize:
                                hasEnded = True

                            # submit the refilled chunk to the pool; without
                            # this append the refill loop could never terminate
                            allResults.append(
                                self.pool.apipe(oFunc, samHeader, samChunk,
                                                sEnvironment))

                        if hasEnded:
                            self.pool.close()

                else:
                    i += 1

            time.sleep(0.5)

        print("Pool Join")
        self.pool.join()
        print("Pool Clear")
        self.pool.clear()
        print("Pool Closed")

        return resultObj
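

# A hypothetical driver for the chunked runner above. `count_records` and
# `sum_counts` are illustrative only; `samFile` is assumed to be an open
# pysam.AlignmentFile (anything iterable whose records have .to_string() and
# that exposes a .header works), and `runner` stands for an instance of the
# enclosing class with nprocs set.
def count_records(header, chunk, environment):
    # each worker simply counts the alignments in its chunk
    return len(chunk)


def sum_counts(accumulated, value, environment):
    # reduce step: accumulate the per-chunk counts
    return (accumulated or 0) + value


# total = runner.exec(samFile, count_records, sEnvironment=None,
#                     chunkSize=500, pReduceFunc=sum_counts)
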
Beispiel #30
    def get_beta_parallel(self):
        if os.path.isfile('beta.csv') and not self.is_update:
            beta = data.read_data(['beta'], ['beta'])
            self.bb_data.factor['beta'] = beta.ix['beta']
        else:
            # Market-value-weighted average of all stocks' daily log returns, weighted by the previous trading day's market value
            cap_wgt_universe_return = self.bb_data.stock_price.ix[
                'daily_excess_return'].mul(
                    self.bb_data.stock_price.ix['FreeMarketValue'].shift(
                        1)).div(self.bb_data.stock_price.ix['FreeMarketValue'].
                                shift(1).sum(1),
                                axis=0).sum(1)

            # Exponential decay weights
            exponential_weights = barra_base.construct_expo_weights(63, 252)

            # Regression function
            def reg_func(y, *, x, weights):
                # If y is entirely NaN (or has too few non-NaN values), return NaN directly; the threshold can be set freely
                if y.notnull().sum() <= 100:
                    return pd.Series({'beta': np.nan, 'hsigma': np.nan})
                x = sm.add_constant(x)
                model = sm.WLS(y, x, weights=weights, missing='drop')
                results = model.fit()
                resid = results.resid
                # Pre-compute hsigma here ---------------------------------------------------------------
                # std over 252 trading days with a 63-day half-life
                exponential_weights_h = barra_base.construct_expo_weights(
                    63, 252)
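                # (presumably an exponentially decaying weight vector over the
                # 252-day window with a 63-day half-life, i.e. roughly
                # proportional to 0.5 ** (lag / 63); construct_expo_weights is
                # defined elsewhere in barra_base)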
                # Attach an index to the weights so they align with resid
                exponential_weights_h = pd.Series(exponential_weights_h,
                                                  index=y.index)
                hsigma = (resid * exponential_weights_h).std()
                # ----------------------------------------------------------------------------------------
                return pd.Series({'beta': results.params[1], 'hsigma': hsigma})

            # Run the regressions following Barra's methodology
            # Function that computes beta for each period
            def one_time_beta(cursor):
                curr_data = self.bb_data.stock_price.ix['daily_excess_return',
                                                        cursor - 251:cursor +
                                                        1, :]
                curr_x = cap_wgt_universe_return.ix[cursor - 251:cursor + 1]
                temp = curr_data.apply(reg_func,
                                       x=curr_x,
                                       weights=exponential_weights)
                print(cursor)
                return temp

            import pathos.multiprocessing as mp
            if __name__ == '__main__':
                ncpus = 16
                p = mp.ProcessPool(ncpus)
                # Start from period 252
                data_size = np.arange(
                    251, self.bb_data.stock_price.ix['daily_excess_return'].
                    shape[0])
                chunksize = int(len(data_size) / ncpus)
                results = p.map(one_time_beta, data_size, chunksize=chunksize)
                # Store the results
                beta = pd.concat([i.ix['beta'] for i in results], axis=1).T
                hsigma = pd.concat([i.ix['hsigma'] for i in results], axis=1).T
                # The dates for both results are the original dates with the first 251 periods dropped, since those periods were not computed
                data_index = self.bb_data.stock_price.iloc[:,
                                                           251 - self.bb_data.
                                                           stock_price.shape[
                                                               1]:, :].major_axis
                beta = beta.set_index(data_index)
                hsigma = hsigma.set_index(data_index)
                self.bb_data.factor['beta'] = beta
                self.temp_hsigma = hsigma.reindex(
                    self.bb_data.stock_price.major_axis)