def quantiles(self, *quantiles):
    """Return the estimated data value for the given quantile(s).

    The requested quantile(s) must be between 0 and 1. Note that even if a
    single quantile is input, a list is always returned.
    """
    temp = bin_sums(self.bins)
    sums = list(accumulate(temp))
    result = []
    for x in quantiles:
        target_sum = x * self.total
        if x <= 0:
            qq = self._min
        elif x >= 1:
            qq = self._max
        else:
            index = bisect_left(sums, target_sum)
            bin_i = self.bins[index]
            if index < len(sums):
                bin_i1 = self.bins[index + 1]
            else:
                bin_i1 = self.bins[index]
            if index:
                prev_sum = sums[index - 1]
            else:
                prev_sum = 0.0
            qq = _compute_quantile(target_sum, bin_i, bin_i1, prev_sum + 1)
        result.append(qq)
    return result
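# A minimal, self-contained sketch of the idea behind the method above: cumulative bin
# counts (via itertools.accumulate) plus bisect locate the bin that straddles the target
# rank, and the value is interpolated inside that bin. The helper below is hypothetical --
# it uses plain linear interpolation between bin centers, which is simpler than whatever
# _compute_quantile does in the original histogram class.
from bisect import bisect_left
from itertools import accumulate


def approx_quantile(bins, total, q):
    """bins is a sorted list of (center, count) pairs; q is in (0, 1)."""
    counts = [c for _, c in bins]
    sums = list(accumulate(counts))        # cumulative counts per bin
    target = q * total
    index = bisect_left(sums, target)      # first bin whose cumulative count reaches target
    if index == 0:
        return bins[0][0]
    if index >= len(bins):
        return bins[-1][0]
    prev_sum = sums[index - 1]
    frac = (target - prev_sum) / counts[index]   # how far into this bin the target falls
    left, right = bins[index - 1][0], bins[index][0]
    return left + frac * (right - left)


# approx_quantile([(1.0, 4), (2.0, 4), (3.0, 2)], total=10, q=0.5) -> 1.25 under this interpolation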
def generate(self, ffrom, tto):
    period_type = self.get_period_type()
    sum_type = self.config.get("sum_type", "sum")
    ret, new_ffrom, new_tto = self._reused_data(ffrom, tto)
    if new_ffrom is None and new_tto is None:
        # full reuse - ret == old_data
        return ret
    query = self._get_query()
    with self.db.cursor() as cursor:
        for period_from in generate_date_series(new_ffrom, new_tto, period_type):
            period_to = calculate_period_to(period_from, period_type)
            params = {
                'ffrom': period_from,
                'tto': period_to,
                "period": period_type
            }
            params.update(self.get_params())
            try:
                cursor.execute(query, params)
            except:
                import ipdb
                ipdb.set_trace()
            rrow = cursor.fetchone()
            if not rrow:
                ret[period_from] = 0
            else:
                ret[period_from] = rrow["value"]
    totalized_movs = sorted(ret.items(), key=lambda k_v: k_v[0])
    accumulate(totalized_movs, sum_type)
    if [row for row in totalized_movs if row[1] is None]:
        import ipdb
        ipdb.set_trace()
    data = [{
        "label": row[0].strftime("%Y-%m-%d"),
        "data": float(row[1])
    } for row in totalized_movs]
    return data
def generate(self, ffrom, tto):
    period_type = self.get_period_type()
    sum_type = self.config.get("sum_type", "sum")
    ret, new_ffrom, new_tto = self._reused_data(ffrom, tto)
    if new_ffrom is None and new_tto is None:
        # full reuse - ret == old_data
        return ret
    query = self._get_query()
    with self.db.cursor() as cursor:
        params = {
            'ffrom': new_ffrom,
            'tto': new_tto,
            "period": period_type
        }
        params.update(self.get_params())
        try:
            cursor.execute(query, params)
        except:
            import ipdb
            ipdb.set_trace()
            raise
        for row in cursor.fetchall():
            ret[ensure_date(row["period"])] = row["value"]
    # Fill missing dates with zeros
    missing_dates = [
        d for d in generate_date_series(new_ffrom, new_tto, period_type)
        if d not in ret
    ]
    for missing in missing_dates:
        ret[missing] = 0
    totalized_movs = sorted(ret.items(), key=lambda k_v: k_v[0])
    accumulate(totalized_movs, sum_type)
    data = [{
        "label": row[0].strftime("%Y-%m-%d"),
        "data": float(row[1])
    } for row in totalized_movs]
    return data
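# The accumulate() used by these report generators is not itertools.accumulate: it appears
# to mutate the sorted (date, value) list in place according to sum_type. A hedged sketch
# of what such a helper could look like (the name and the "sum" behaviour are assumptions,
# not the project's actual implementation):
def accumulate_movements(totalized_movs, sum_type):
    """Turn per-period values into a running total when sum_type == 'sum'."""
    if sum_type != "sum":
        return  # leave per-period values untouched for plain (non-cumulative) reports
    running = 0
    for i, (period, value) in enumerate(totalized_movs):
        running += value or 0
        totalized_movs[i] = (period, running)  # rewrite the pair with the cumulative value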
def calculate_gae(rewards, values, gamma, lamb, normalize=True):
    td_errors = calculate_td_errors(rewards, values, gamma)
    discount_rate = gamma * lamb
    advantages = accumulate(td_errors, discount_rate)
    future_returns = calculate_future_rewards(rewards, gamma)
    if normalize:
        advantages = batch_normalize(advantages)
    return advantages, future_returns
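# Here accumulate(td_errors, discount_rate) plays the role of the reversed discounted sum
# used by GAE: A_t = delta_t + (gamma * lambda) * A_{t+1}. A minimal sketch under the
# assumption that td_errors is a non-empty list of per-step arrays ordered from t=0 to T-1:
import numpy as np


def discounted_accumulate(xs, discount):
    """Return ys with ys[t] = xs[t] + discount * ys[t+1], computed backwards in time."""
    ys = [None] * len(xs)
    running = np.zeros_like(xs[-1])
    for t in reversed(range(len(xs))):
        running = xs[t] + discount * running
        ys[t] = running
    return ys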
def generate(self, ffrom, tto):
    account_codes = self.config["account_codes"].split(",")
    accounts_plan_id = self.config["accounts_plan_id"]
    journal = self.config["journal"]
    sum_type = self.config["sum_type"]
    sign = self.config["sign"]
    account_ids = self._get_account_ids(accounts_plan_id, account_codes)
    totalized_movs = self._sumarize_movements(ffrom, tto, account_ids, journal)
    accumulate(totalized_movs, sum_type)
    data = [{
        "label": row[0].strftime("%Y-%m-%d"),
        "data": sign * int(row[1])
    } for row in totalized_movs]
    return data
def get(self):
    btc_rpc_connection = AuthServiceProxy(BTC_RPC_URL)  # todo-junying-20180325
    try:
        addr = self.get_argument("address")
        data = BTC_ListUTXO.utxo(btc_rpc_connection, addr)
        if not data:
            self.write(json.dumps(BaseHandler.error_ret_with_data("utxo no available")))
            return
        from utils import accumulate
        self.write(json.dumps(BaseHandler.success_ret_with_data(accumulate(data)),
                              default=decimal_default))
    except Exception as e:
        self.write(json.dumps(BaseHandler.error_ret_with_data("error: %s" % e)))
        print("BTC_GetBalance error:{0} in {1}".format(e, get_linenumber()))
def get(self):
    btc_rpc_connection = AuthServiceProxy(RPC_URL)
    try:
        addr = self.get_argument("address")
        data = BTC_ListUTXO.utxo(btc_rpc_connection, addr)
        if not data:
            self.write(json.dumps(BaseHandler.error_ret_with_data("0")))
            return
        from utils import accumulate
        self.write(
            json.dumps(BaseHandler.success_ret_with_data('%.8f' % accumulate(data)),
                       default=decimal_default))
    except Exception as e:
        self.write(
            json.dumps(BaseHandler.error_ret_with_data("error: %s" % e)))
        logging.error("BTC_GetBalance error:{0} in {1}".format(
            e, get_linenumber()))
def get(self):
    omni_rpc_connection = AuthServiceProxy(OMNI_RPC_URL)
    try:
        addr = self.get_argument("address")
        # print("addr" + str(addr))
        data = uBTC_ListUTXO.utxo(omni_rpc_connection,
                                  self.get_argument("address"), 0, 99999)
        if not data:
            self.write(
                json.dumps(
                    BaseHandler.error_ret_with_data("utxo no available")))
            return
        from utils import accumulate
        self.write(
            json.dumps(BaseHandler.success_ret_with_data('%.8f' % accumulate(data)),
                       default=decimal_default))
    except Exception as e:
        self.write(
            json.dumps(BaseHandler.error_ret_with_data("error: %s" % e)))
        logging.error("uBTC_GetBalance error:{0} in {1}".format(
            e, get_linenumber()))
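# In these handlers, utils.accumulate(data) reduces a listunspent-style UTXO list to a
# balance. The exact helper is not shown; a plausible sketch, assuming each UTXO is a dict
# carrying a Decimal-compatible "amount" field:
from decimal import Decimal


def accumulate_utxo_amounts(utxos):
    """Sum the "amount" field over a list of UTXO dicts, keeping Decimal precision."""
    return sum((Decimal(str(u["amount"])) for u in utxos), Decimal("0"))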
def train(P, opt, models, optimizers, train_loader, logger):
    generator, discriminator, GD, g_ema = models
    opt_G, opt_D = optimizers

    losses = {
        'G_loss': [],
        'D_loss': [],
        'D_penalty': [],
        'D_real': [],
        'D_gen': [],
        'D_r1': []
    }
    metrics = {}
    metrics['image_grid'] = ImageGrid(volatile=P.no_gif)
    metrics['fixed_gen'] = FixedSampleGeneration(g_ema, volatile=P.no_gif)
    if not P.no_fid:
        metrics['fid_score'] = FIDScore(opt['dataset'], opt['fid_size'], P.n_eval_avg)

    logger.log_dirname("Steps {}".format(P.starting_step))

    for step in range(P.starting_step, opt['max_steps'] + 1):
        d_regularize = (step % P.d_reg_every == 0) and (P.lbd_r1 > 0)

        if P.use_warmup:
            _update_warmup(opt_G, step, opt["warmup"], opt["lr"])
            _update_warmup(opt_D, step, opt["warmup"], opt["lr_d"])
        if (not P.use_warmup) or step > opt["warmup"]:
            cur_lr_g = _update_lr(opt_G, step, opt["batch_size"], P.halflife_lr, opt["lr"])
            cur_lr_d = _update_lr(opt_D, step, opt["batch_size"], P.halflife_lr, opt["lr_d"])
            if cur_lr_d and cur_lr_g:
                logger.log('LR Updated: [G %.5f] [D %.5f]' % (cur_lr_g, cur_lr_d))

        do_ema = (step * opt['batch_size']) > (P.ema_start_k * 1000)
        accum = P.accum if do_ema else 0
        accumulate(g_ema, generator, accum)

        generator.train()
        discriminator.train()

        images, labels = next(train_loader)
        images = images.cuda()

        set_grad(generator, True)
        set_grad(discriminator, False)

        d_gen = GD(P, images, train_G=True)
        g_loss = _loss_G_fn(d_gen)

        opt_G.zero_grad()
        g_loss.backward()
        opt_G.step()
        losses['G_loss'].append(g_loss.item())

        set_grad(generator, False)
        set_grad(discriminator, True)

        d_all, view_r, view_f = GD(P, images)
        d_loss, aux = _loss_D_fn(P, d_all, view_r, view_f)
        loss = d_loss + aux['penalty']

        if d_regularize:
            r1 = GD(P, images, return_r1_loss=True).mean()
            lazy_r1 = (0.5 * P.lbd_r1) * r1 * P.d_reg_every
            loss = loss + lazy_r1
            losses['D_r1'].append(r1.item())

        opt_D.zero_grad()
        loss.backward()
        opt_D.step()

        losses['D_loss'].append(d_loss.item())
        losses['D_real'].append(aux['d_real'].item())
        losses['D_gen'].append(aux['d_gen'].item())
        losses['D_penalty'].append(aux['penalty'].item())

        for i in range(opt['n_critic'] - 1):
            images, labels = next(train_loader)
            images = images.cuda()
            d_all, view_r, view_f = GD(P, images)
            d_loss, aux = _loss_D_fn(P, d_all, view_r, view_f)
            loss = d_loss + aux['penalty']
            opt_D.zero_grad()
            loss.backward()
            opt_D.step()

        generator.eval()
        discriminator.eval()

        if step % P.print_every == 0:
            logger.log('[Steps %7d] [G %.3f] [D %.3f]' %
                       (step, losses['G_loss'][-1], losses['D_loss'][-1]))
            for name in losses:
                values = losses[name]
                if len(values) > 0:
                    logger.scalar_summary('gan/train/' + name, values[-1], step)

        if step % P.evaluate_every == 0:
            logger.log_dirname("Steps {}".format(step + 1))
            fid_score = metrics.get('fid_score')
            fixed_gen = metrics.get('fixed_gen')
            image_grid = metrics.get('image_grid')

            if fid_score:
                fid_avg = fid_score.update(step, g_ema)
                fid_score.save(logger.logdir + f'/results_fid_{P.eval_seed}.csv')
                logger.scalar_summary('gan/test/fid', fid_avg, step)
                logger.scalar_summary('gan/test/fid/best', fid_score.best, step)

            if not P.no_gif:
                _ = fixed_gen.update(step)
                imageio.mimsave(
                    logger.logdir + f'/training_progress_{P.eval_seed}.gif',
                    fixed_gen.summary())
            aug_grid = image_grid.update(step, P.augment_fn(images))
            imageio.imsave(logger.logdir + f'/real_augment_{P.eval_seed}.jpg', aug_grid)

            G_state_dict = generator.state_dict()
            D_state_dict = discriminator.state_dict()
            Ge_state_dict = g_ema.state_dict()
            torch.save(G_state_dict, logger.logdir + '/gen.pt')
            torch.save(D_state_dict, logger.logdir + '/dis.pt')
            torch.save(Ge_state_dict, logger.logdir + '/gen_ema.pt')
            if fid_score and fid_score.is_best:
                torch.save(G_state_dict, logger.logdir + '/gen_best.pt')
                torch.save(D_state_dict, logger.logdir + '/dis_best.pt')
                torch.save(Ge_state_dict, logger.logdir + '/gen_ema_best.pt')
            if step % P.save_every == 0:
                torch.save(G_state_dict, logger.logdir + f'/gen_{step}.pt')
                torch.save(D_state_dict, logger.logdir + f'/dis_{step}.pt')
                torch.save(Ge_state_dict, logger.logdir + f'/gen_ema_{step}.pt')
            torch.save(
                {
                    'epoch': step,
                    'optim_G': opt_G.state_dict(),
                    'optim_D': opt_D.state_dict(),
                }, logger.logdir + '/optim.pt')
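# accumulate(g_ema, generator, accum) above is the usual EMA update on generator weights:
# the running copy g_ema is pulled toward the live generator with decay `accum` (a decay of
# 0 simply copies the live weights, which matches the behaviour before ema_start_k). A
# sketch of the common form of this helper (assumed, not copied from this repository);
# model_ema and model are torch.nn.Module instances:
def ema_accumulate(model_ema, model, decay=0.999):
    """In-place exponential moving average: ema = decay * ema + (1 - decay) * model."""
    ema_params = dict(model_ema.named_parameters())
    live_params = dict(model.named_parameters())
    for name, param in ema_params.items():
        param.data.mul_(decay).add_(live_params[name].data, alpha=1 - decay)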
    return user


def get_password():
    pwd = os.environ.get("spark_password", None)
    if pwd is None:
        pwd = getpass("No password configured, Type password: ")
    return pwd


if __name__ == "__main__":
    username = get_username()
    password = get_password()
    sparkClient = SparkClient(username, password)
    account = sparkClient.get_account()
    balance = sparkClient.get_balance(account.key)
    holdings = sparkClient.get_holdings(account.key)
    print(account)
    print_currency(prefix="Portfolio value", value=balance.portfolio_value)
    print_currency(prefix="Holdings value",
                   value=accumulate(holdings, lambda a: a.current_value))
    print_currency(prefix="Accumulative profit",
                   value=accumulate(holdings, lambda a: a.profit),
                   text_format=red_green_color)
    print_currency(prefix="Remaining cash", value=balance.remaining_cash)
    for holding in holdings:
        print(holding)
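# Here accumulate(holdings, fn) sums a derived attribute across holdings. A tiny sketch of
# that signature (the name and behaviour are inferred from the call sites above, not taken
# from the SparkClient utilities):
def accumulate_by(items, key):
    """Sum key(item) over items, e.g. accumulate_by(holdings, lambda h: h.profit)."""
    return sum(key(item) for item in items)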
def run(self):
    try:
        # setting variables and constants
        model = self.model
        generator = model.generator.train()
        g_running = model.g_running
        discriminator = model.discriminator
        n_frames_discriminator = model.n_frames_discriminator
        g_optimizer = model.g_optimizer
        d_optimizer = model.d_optimizer
        nfd_optimizer = model.nfd_optimizer
        used_samples = model.used_samples
        step = model.step
        resolution = model.resolution
        iteration = model.iteration
        n_critic = constants.N_CRITIC

        config = self.config
        code_size = config.get('code_size', constants.DEFAULT_CODE_SIZE)
        lr = config.get('lr', constants.LR)
        batch_size = config.get('batch_size', constants.BATCH_SIZE)
        init_size = config.get('init_size', constants.INIT_SIZE)
        n_gen_steps = config.get('n_gen_steps', 1)
        max_size = config['max_size']
        max_iterations = config.get('max_iterations', constants.MAX_ITERATIONS)
        samples_per_phase = config['samples_per_phase']
        loss_fn = config['loss_fn']
        n_frames_params = config.get('n_frames_params', dict())
        n_frames = n_frames_params.get('n', 1)
        n_frames_loss_coef = n_frames_params.get('loss_coef', 0)
        n_frames_final_freq = n_frames_params.get('final_freq', 0)
        n_frames_decay_duration = n_frames_params.get('decay_duration', 0)
        crop_freq = n_frames_params.get('crop_freq', 0)
        mixing = config.get('mixing', False)

        # getting data
        cur_batch_size = batch_size[resolution]
        images_dataloader = CycleLoader(self.images_dataset, cur_batch_size, resolution)
        if n_frames_loss_coef > 0:
            n_frames_dataloader = CycleLoader(self.n_frames_dataset, cur_batch_size, resolution)
        if crop_freq > 0:
            n_crops_dataloader = CycleLoader(self.n_crops_dataset, cur_batch_size, resolution)

        if iteration == 0:
            self.adjust_lr(lr, resolution)
        pbar = tqdm.trange(iteration, max_iterations, initial=iteration)

        requires_grad(generator, False)
        requires_grad(discriminator, True)

        discr_loss_val = 0
        gen_loss_val = 0
        grad_loss_val = 0

        max_step = int(math.log2(max_size)) - 2
        final_progress = False

        for iteration in pbar:
            model.iteration = iteration

            # update alpha, step and resolution
            alpha = min(1, 1 / samples_per_phase * (used_samples + 1))
            if resolution == init_size or final_progress:
                alpha = 1
            if not final_progress and used_samples > samples_per_phase * 2:
                LOGGER.debug(f'Used samples: {used_samples}.')
                used_samples = 0
                step += 1
                if step > max_step:
                    step = max_step
                    final_progress = True
                    LOGGER.info('Final progress.')
                else:
                    alpha = 0
                    LOGGER.info(
                        f'Changing resolution from {resolution} to {resolution * 2}.')
                resolution = 4 * 2**step
                model.step = step
                model.resolution = resolution
                model.used_samples = used_samples
                LOGGER.debug(f'Used samples on saving: {model.used_samples}.')
                self.save_model(step=step)
                self.adjust_lr(lr, resolution)

                # setup loaders
                cur_batch_size = batch_size[resolution]
                images_dataloader = CycleLoader(self.images_dataset, cur_batch_size, resolution)
                if n_frames_loss_coef > 0:
                    n_frames_dataloader = CycleLoader(
                        self.n_frames_dataset, cur_batch_size, resolution)
                if crop_freq > 0:
                    n_crops_dataloader = CycleLoader(
                        self.n_crops_dataset, cur_batch_size, resolution)

            # decide if need to use n_frames on this iteration
            if final_progress or n_frames_decay_duration == 0:
                n_frames_freq = n_frames_final_freq
            else:
                n_frames_freq = 0.5 - min(1, used_samples / n_frames_decay_duration) * \
                    (0.5 - n_frames_final_freq)
            n_frames_iteration = True if random.random() < n_frames_freq else False

            if n_frames_iteration:
                cur_discr = n_frames_discriminator
                cur_dataloader = n_frames_dataloader
                cur_n_frames = n_frames
                cur_d_optimizer = nfd_optimizer
            else:
                cur_discr = discriminator
                cur_dataloader = images_dataloader
                cur_n_frames = 1
                cur_d_optimizer = d_optimizer

            cur_discr.zero_grad()
            real_image = next(cur_dataloader)

            LOGGER.debug(f'n_frames iteration: {n_frames_iteration}')
            LOGGER.debug(f'cur_discr: {type(cur_discr.module)}')
            LOGGER.debug(f'real_image shape {real_image.shape}; resolution {resolution}')

            # discriminator step
            real_predict, real_grad_loss_val = discr_backward_real(
                cur_discr, loss_fn, real_image, step, alpha)

            if mixing and random.random() < 0.9:
                num_latents = 2
            else:
                num_latents = 1
            LOGGER.debug(f'Batch size: {cur_batch_size}')
            latents = get_latents(cur_batch_size, code_size, 2 * num_latents)
            gen_in1 = latents[:num_latents]
            gen_in2 = latents[num_latents:]
            LOGGER.debug(f'Latents shape: {gen_in1[0].shape}')

            fake_image = generator(gen_in1, step=step, alpha=alpha, n_frames=cur_n_frames)

            crop_iteration = False
            if n_frames_iteration:
                if random.random() < crop_freq:
                    crop_iteration = True
                    fake_image = next(n_crops_dataloader)

            discr_loss_val, fake_grad_loss_val = discr_backward_fake(
                cur_discr, loss_fn, fake_image, real_image, real_predict, step, alpha, False)
            grad_loss_val = real_grad_loss_val or fake_grad_loss_val
            cur_d_optimizer.step()

            # generator step
            if (iteration + 1) % n_critic == 0:
                for gen_step in range(n_gen_steps):
                    generator.zero_grad()
                    requires_grad(generator, True)
                    requires_grad(cur_discr, False)

                    fake_image = generator(gen_in2, step=step, alpha=alpha,
                                           n_frames=cur_n_frames)
                    LOGGER.debug(f'fake image shape when gen {fake_image.shape}')
                    predict = cur_discr(fake_image, step=step, alpha=alpha)

                    if loss_fn == 'wgan-gp':
                        loss = -predict.mean()
                    elif loss_fn == 'r1':
                        loss = F.softplus(-predict).mean()
                    if n_frames_iteration:
                        loss *= n_frames_loss_coef

                    gen_loss_val = loss.item()
                    loss.backward()
                    g_optimizer.step()
                    LOGGER.debug('generator optimizer step')
                    accumulate(to_model=g_running, from_model=generator.module)
                    requires_grad(generator, False)
                    requires_grad(cur_discr, True)

            used_samples += real_image.shape[0]
            model.used_samples = used_samples

            if (iteration + 1) % constants.SAMPLE_FREQUENCY == 0:
                LOGGER.info(f'Saving samples on {iteration + 1} iteration.')
                save_sample(generator=g_running,
                            alpha=alpha,
                            step=step,
                            code_size=code_size,
                            resolution=resolution,
                            save_dir=os.path.join(self.sample_dir),
                            name=f'{str(iteration + 1).zfill(6)}',
                            sample_size=constants.SAMPLE_SIZE,
                            images_n_frames=n_frames,
                            video_n_frames=32)

            if (iteration + 1) % constants.SAVE_FREQUENCY == 0:
                self.save_model(iteration=iteration + 1)

            if n_frames_iteration:
                prefix = 'NF'
                suffix = 'n_frames'
            else:
                prefix = ''
                suffix = 'loss'

            state_msg = f'Size: {resolution}; {prefix}G: {gen_loss_val:.3f}; {prefix}D: {discr_loss_val:.3f}; ' + \
                f'{prefix}Grad: {grad_loss_val:.3f}; Alpha: {alpha:.5f}'
            pbar.set_description(state_msg)

            if iteration % constants.LOG_LOSS_FREQUENCY == 0:
                self.summary_writer.add_scalar('size', resolution, iteration)
                self.summary_writer.add_scalar(f'G/{suffix}', gen_loss_val, iteration)
                self.summary_writer.add_scalar(f'D/{suffix}', discr_loss_val, iteration)
                self.summary_writer.add_scalar(f'Grad/{suffix}', grad_loss_val, iteration)
                self.summary_writer.add_scalar('alpha', alpha, iteration)
                if n_frames_iteration and crop_freq > 0:
                    if crop_iteration:
                        suffix = 'crop'
                    else:
                        suffix = 'no_crop'
                    self.summary_writer.add_scalar(f'D/{suffix}', discr_loss_val, iteration)
    except KeyboardInterrupt:
        LOGGER.warning('Interrupted by user')
        self.save_model(iteration=iteration)
def calculate_future_rewards(rewards, gamma):
    """rewards is a list over time steps, where the length of the list is
    max_episode_length and each element holds the rewards for one batch.
    So the shape of rewards is [max_episode_length, batch_size]."""
    return accumulate(rewards, gamma)
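# A concrete check of the convention above, assuming accumulate(rewards, gamma) is the
# reversed discounted sum R_t = r_t + gamma * R_{t+1} (see the sketch after calculate_gae).
# rewards is indexed [time][batch]:
import numpy as np

rewards = np.array([[1.0, 0.0],
                    [0.0, 1.0],
                    [1.0, 0.0]])
gamma = 0.9
expected_first_step = rewards[0] + gamma * rewards[1] + gamma**2 * rewards[2]
# expected_first_step == array([1.81, 0.9]); under that assumption,
# calculate_future_rewards(rewards, gamma) would return this as its first row.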
def generate(self, ffrom, tto):
    sucursal_id = self.config["sucursal"]
    sucursal_negated = False
    if sucursal_id == "all":
        sucursal_id = None
    elif sucursal_id.startswith("-"):
        sucursal_negated = True
    period_type = self.config["period"]
    ret, new_ffrom, new_tto = self._reused_data(ffrom, tto)
    if new_ffrom is None and new_tto is None:
        # full reuse - ret == old_data
        return ret
    sum_type = self.config["sum_type"]
    if sucursal_negated:
        equal = "!="
    else:
        equal = "="
    query = """SELECT date_trunc('%s', S.real_date) AS period,
                      SUM(S.amount) AS amount
               FROM (
                   SELECT D.real_date,
                          getBuyQuotationAt(SB.currency_id, D.real_date) * SB.amount AS amount
                   FROM sale_bill SB
                   INNER JOIN document D ON D.document_id = SB.document_id
                   WHERE NOT SB.cancelled
                     AND (%%(sucursal_id)s IS NULL OR D.sucursal_id %s %%(sucursal_id)s)
                     AND D.real_date BETWEEN %%(ffrom)s AND %%(tto)s
                   UNION ALL
                   SELECT D.real_date,
                          getBuyQuotationAt(SCN.currency_id, D.real_date) * -SCN.amount AS amount
                   FROM sale_credit_note SCN
                   INNER JOIN document D ON D.document_id = SCN.document_id
                   WHERE NOT SCN.cancelled
                     AND (%%(sucursal_id)s IS NULL OR D.sucursal_id %s %%(sucursal_id)s)
                     AND D.real_date BETWEEN %%(ffrom)s AND %%(tto)s
               ) AS S
               GROUP BY period
               ORDER BY period
               """ % (period_type, equal, equal)
    with self.db.cursor() as cursor:
        cursor.execute(
            """
            SELECT generate_series AS period, 0 AS amount
            FROM generate_series(%%(ffrom)s, %%(tto)s, interval '1 %s')
            """ % period_type, {
                'ffrom': new_ffrom,
                'tto': new_tto
            })
        ret.update(
            dict((ensure_date(row["period"]), row["amount"])
                 for row in cursor.fetchall()))
        cursor.execute(query, {
            "sucursal_id": sucursal_id,
            'ffrom': new_ffrom,
            'tto': new_tto
        })
        for row in cursor.fetchall():
            ret[ensure_date(row["period"])] = row["amount"]
    totalized_movs = sorted(ret.items(), key=lambda k_v: k_v[0])
    accumulate(totalized_movs, sum_type)
    data = [{
        "label": row[0].strftime("%Y-%m-%d"),
        "data": float(row[1])
    } for row in totalized_movs]
    return data
def process_consecutive_blocks(contigs_group, soi, chr_, snp_threshold,
                               sample_list, num_of_hets, lods_cut_off,
                               writelod, maxed_as):

    #print()
    print(' - Grouping the dataframe using unique "PI - phased index" values. ')

    ''' Step 02 - D: group dataframe again by "PI keys" of soi and then sort by
    minimum "POS" value for each "PI key".
    - This sorting is necessary because sometimes "haplotype blocks" are like 3-3-3-3  5-5-5  3-3-3-3
        - i.e there are small RBphased blocks within the boundary of larger RBphased block.
        - Not sure what is causing this (probably sampling difference of large vs. small chunks in PE reads)
        - This problem should go away in first round of haplotype-extension'''
    contigs_group = contigs_group. \
        assign(New=contigs_group.groupby([soi + ':PI']).
               POS.transform('min')).sort_values(['New', 'POS'])

    ''' Step 03: Now, start reading the "contigs_group" for haplotype-extension.
    A) Store the data as dictionary with 'header' values as keys. Some keys are: CHROM, POS,
       sample (PI, PG within sample), etc ...
       Then group the dictionary using unique "PI" values as 'keys' for grouping.
        Note: This dict-data should contain information about two adjacent haplotype blocks that need extending.
        In this example I want to extend the haplotypes for "sample ms02g" which has two blocks 6 and 4.
        So, I read the PI and PG value for this sample. Also, data should be stored with some unique keys.
    B) Iterate over two consecutive Haplotype-Blocks at once.
        Note: While iterating over two blocks, initially we write the very first block of the "contig".
        With this method, now when we iterate over two consecutive blocks we only update and write the
        second block.
    '''

    # convert pandas dataframe back to text-like file before converting it into dictionary.
    contigs_group = pd.DataFrame.to_csv(contigs_group, sep='\t', index=False, header=True)

    ''' Step 03 - A : read the data with header as keys and groupby using those "keys" '''
    phased_dict = csv.DictReader(StringIO(contigs_group), delimiter='\t')
    phased_grouped = itertools.groupby(phased_dict, key=lambda x: x[soi + ':PI'])

    ''' Since the dictionary isn't ordered, we return the order using OrderedDictionary '''
    # ** for future: there is room for improvement in here (memory and speed)
    grouped_data = collections.OrderedDict()
    for key, grp in phased_grouped:
        grouped_data[key] = accumulate(grp)

    ''' Clear memory '''
    del phased_dict
    del phased_grouped
    del contigs_group

    #print()
    print(' - Starting MarkovChains for contig %s' % chr_)

    ''' Step 03 - B : now pipe the data for phase extension '''
    ''' Step 03 - B : And, iterate over two consecutive Haplotype-Blocks at once. This is done to obtain
    all possible Haplotype configurations between two blocks. The (keys, values) for the first block are
    represented as k1,v1 and for the later block as k2,v2. '''

    ''' Step 03 - B (i): Before running consecutive blocks, we write data from the very first block to the file.
    Reason : Before we start computing and solving the haplotype phase state, we plan to write the data for
    the very first block (k1, v1). After that, we solve the relation between two consecutive ..
    .. blocks but only write data from the 2nd block each time - based on what relation comes out. '''
    very_first_block = [list(grouped_data.items())[0]]

    if len(list(grouped_data.items())) == 1:
        print('there is only one block, so skipping phase extension')

    # write header of the extended phase-block
    extended_haplotype = '\t'.join(['CHROM', 'POS', 'REF', 'all-alleles',
                                    soi + ':PI', soi + ':PG_al']) + '\n'
    if writelod == 'yes':  # add extra field if desired by user
        extended_haplotype = extended_haplotype.rstrip('\n') + '\tlog2odds\n'
        log2odds = ''

    # write data/values from very first block.
    for k1, v1 in very_first_block:
        for r1, vals in enumerate(v1[soi + ':PI']):
            new_line = '\t'.join([v1['CHROM'][r1], v1['POS'][r1], v1['REF'][r1],
                                  v1['all-alleles'][r1], v1[soi + ':PI'][r1],
                                  v1[soi + ':PG_al'][r1]]) + '\n'
            if writelod == 'yes':
                new_line = new_line.rstrip('\n') + '\t.\n'
            extended_haplotype += new_line

        #print('very first block end\n\n')  # marker for debugging

    ''' Step 03 - B (ii): Starting MarkovChains.
        Now, read data from two consecutive blocks at a time.
        Note: At the end of computation write the data only from each k2 block. No need to write the data
        from the k1 block of each iteration because it was written in the earlier loop.'''

    ''' Step 03 - B (ii - 1): Create empty "checker variables".
    Note: These checker variables (actually multi-level boolean logic) help to carry over information from
    the earlier iteration of a for-loop - i.e identify if the values from the later block (k2, v2) were
    phased to the earlier block (k1, v1) in "parallel" vs. "alternate configuration".
        - If two consecutive blocks are phased, k2_new is now assigned k1 from the earlier block; else
          (if not phased) k2_new stays empty ('').
        - So, the role of the flipped variable is to keep information on whether k2,v2 were phased straight
          vs. alternate compared to k1,v1 in the earlier run. These checker variables are crucial to keep
          the proper phase-state in the output file.'''

    # start checker variables
    k2_new = ''  # updates the index of k2 for each k1,v1 ; k2,v2 run
    flipped = ''  # boolean logic to check and store if the phase state flipped during extension

    ''' Step 03 - B (ii - 2): Now, read two consecutive blocks at a time '''
    for (k1, v1), (k2, v2) in zip(grouped_data.items(),
                                  itertools.islice(grouped_data.items(), 1, None)):

        ''' Step 03 - B (ii - 2-A): iterate over the first Haplotype Block, i.e the k1 block.
        The nucleotides on the left of the phased SNPs are called Block01-haplotype-A,
        and similarly on the right as Block01-haplotype-B. '''

        # iterate over the first Haplotype Block, i.e the k1 block and v1 values
        hap_block1a = [x.split('|')[0] for x in v1[soi + ':PG_al']]  # the left haplotype of block01
        hap_block1b = [x.split('|')[1] for x in v1[soi + ':PG_al']]

        # iterate over the second Haplotype Block, i.e the k2 block and v2 values
        hap_block2a = [x.split('|')[0] for x in v2[soi + ':PG_al']]
        hap_block2b = [x.split('|')[1] for x in v2[soi + ':PG_al']]

        ''' Step 03 - B (ii - 2-B) : Create possible haplotype configurations for the "forward markov chain".
        Possible haplotype configurations will be, either:
        1) Block01-haplotype-A phased with Block02-haplotype-A,
            creating -> hapb1a-hapb2a, hapb1b-hapb2b '''

        ''' First possible configuration '''
        hapb1a_hapb2a = [hap_block1a, hap_block2a]
        hapb1b_hapb2b = [hap_block1b, hap_block2b]

        ''' Or, second possible configuration
        2) Block01-haplotype-A phased with Block02-haplotype-B,
            creating -> hapb1a-hapb2b, hapb1b-hapb2a '''
        hapb1a_hapb2b = [hap_block1a, hap_block2b]
        hapb1b_hapb2a = [hap_block1b, hap_block2a]

        ''' Step 03 - B (ii - 2-C) : Create possible haplotype configurations for the "reverse markov chain"
        - reverse markov chains are added to increase the confidence in likelihood estimation. '''

        # switch the key values for the reverse markov chain
        k1_r = k2
        k2_r = k1
        v1_r = v2
        v2_r = v1

        # switch the haplotype positions for preparing the reverse markov chains
        hapb1a_hapb2a_r = [hapb1a_hapb2a[1], hapb1a_hapb2a[0]]
        hapb1b_hapb2b_r = [hapb1b_hapb2b[1], hapb1b_hapb2b[0]]
        hapb1a_hapb2b_r = [hapb1a_hapb2b[1], hapb1a_hapb2b[0]]
        hapb1b_hapb2a_r = [hapb1b_hapb2a[1], hapb1b_hapb2a[0]]

        ################################# - inactive for now - can be used for adding SNP phasing later on.
        ''' skip if one of the keys has no values - this is redundant ??
        - keep it for just-in-case situations
        ** can also be used in the future if we want to phase the SNPs that have no assigned 'PI' values,
        i.e the "PI" will be "." '''
        if k1 == '.' or k2 == '.':
            for xi in range(len(v2[soi + ':PI'])):
                new_line = '\t'.join([v2['CHROM'][xi], v2['POS'][xi], v2['REF'][xi],
                                      v2['all-alleles'][xi], k2,
                                      hapb1a_hapb2a[1][xi] + '|' + hapb1b_hapb2b[1][xi]]) + '\n'
                if writelod == 'yes':
                    new_line = new_line.rstrip('\n') + '\t.\n'
                extended_haplotype += new_line

            # update the values of checker variables
            k2_new = ''
            flipped = ''

            continue  # to next consecutive blocks
        ######################################################

        ''' Step 03 - C : Set the threshold for the minimum number of SNPs required in a haplotype block
        before continuing phase extension. '''
        ''' If all the data in soi, in either v1 or v2, are SNPs below a certain threshold we just write
        the data and continue. i.e say if a Haplotype-Block is composed of only 2 SNPs it will be less
        reliable to extend the phase-state.
        - So, this step can also be used to control the minimum number/size of the haplotypes that is
          required before it can be phase-extended.
        - by default the minimum number of SNPs (exclusive) in the soi haplotype is set to 3.
        - If the minimum requirement isn't met just skip extending the phase, write it to the file and
          continue. '''
        number_of_snp_in_soi_v1 = len([x for x in v1[soi + ':PG_al'] if len(x) == 3])
        number_of_snp_in_soi_v2 = len([x for x in v2[soi + ':PG_al'] if len(x) == 3])

        # print('number of SNPs: ', NumSNPsInsoi_v1, NumSNPsInsoi_v2)
        if number_of_snp_in_soi_v1 < snp_threshold \
                or number_of_snp_in_soi_v2 < snp_threshold:
            for xth, vals in enumerate(v2[soi + ':PI']):
                new_line = '\t'.join([v2['CHROM'][xth], v2['POS'][xth], v2['REF'][xth],
                                      v2['all-alleles'][xth], k2,
                                      hapb1a_hapb2a[1][xth] + '|' + hapb1b_hapb2b[1][xth]]) + '\n'
                if writelod == 'yes':
                    new_line = new_line.rstrip('\n') + '\t.\n'
                extended_haplotype += new_line

            # update values of the checker variables
            # this is important, so previous k2 and flip state don't get carried over without purpose
            k2_new = ''
            flipped = ''

            continue  # to next consecutive blocks

        ''' Step 04: For the consecutive blocks that pass the thresholds (SNP number, have PI != '.', etc.),
        pipe the data (k1, v1 ; k2, v2) to a defined function for computation of forward and reverse markov
        chain transition probabilities for these two consecutive blocks (k1, v1; k2, v2) '''

        #### for forward chain ########
        # ** set "orientation=reversed" to compute transition ..
        # .. from the lower tip of the former block with the upper tip of the later block
        # .. this helps in using the closest genomic position between consecutive blocks thus ..
        # .. downsizing the effects created by recombination.
        lhfc_f, lhsc_f = \
            compute_maxLh_score(soi, sample_list, k1, k2, v1, v2, num_of_hets,
                                hapb1a_hapb2a, hapb1b_hapb2b,
                                hapb1a_hapb2b, hapb1b_hapb2a,
                                maxed_as, orientation=reversed)

        #### for reverse chain ########
        # setting "orientation=lambda..." just passes a null value keeping orientation as it is.
        lhfc_r, lhsc_r = compute_maxLh_score \
            (soi, sample_list, k1_r, k2_r, v1_r, v2_r, num_of_hets,
             hapb1a_hapb2a_r, hapb1b_hapb2b_r,
             hapb1a_hapb2b_r, hapb1b_hapb2a_r, maxed_as,
             orientation=lambda x: x)

        ''' Steps 05-06 are inside the function "compute_maxLh_score()". The values (lhfc_f, lhsc_f,
        lhfc_r, lhsc_r) returned from this function are then used in Step 07. '''

        ''' Step 07 : the previous step (Step 06) returns the likelihoods and/or LODs score for both
        "parallel" and alternate configurations (for both forward and reverse algorithm).
        - We now extend the phase states by comparing the LODs score against cutoff values.'''

        ''' Step 07 - A(i): calculate the average of the likelihoods, odds and then log2 of odds. '''
        # average of the likelihoods for first vs. second configuration
        # (from both forward and reverse algorithm)
        # ** note: the "maxed_as" variable doesn't apply here, because maxLH using forward vs. reverse ..
        # .. are just re-estimates. So, we simply take an average on both "maxSum" and "maxPd"
        avg_lhfc = Decimal(lhfc_f + lhfc_r) / 2
        avg_lhsc = Decimal(lhsc_f + lhsc_r) / 2

        # therefore, the odds of first_vs_second_configuration is
        odds_fc_vs_sc = avg_lhfc / avg_lhsc

        ''' Step 07 - A(ii) : convert the likelihoods to odds-ratio and then log2 of odds '''
        lods2_score_1st_config = Decimal(odds_fc_vs_sc).ln() / (Decimal('2').ln())
        lods2_score_2nd_config = -lods2_score_1st_config

        #print('logOdds')  # marker for debugging
        #print(lods2_score_1st_config)

        ''' Step 07 - B : pipe the LOD scores and write the phase state between two consecutive blocks.
        - use "lods cutoff" to decide on phase extension
        - and then store and write it to files.
        ** We can also use the accumulation at this stage to run histogram building at a later stage.
            - that accumulated "extended_haplotype" can be written all at once - this is important
              while multiprocessing. '''
        k2_new, flipped, extended_haplotype = extend_phase_state(
            soi, k1, k2, v1, v2, k2_new, flipped, lods2_score_1st_config,
            lods_cut_off, extended_haplotype, hapb1a_hapb2a, hapb1b_hapb2b, writelod)

        ''' Now, go to Step 08, function "extend_phase_state" '''
        # this process updates the data in "extended_haplotype" recursively on the for-loop

    # finally return the extended haplotype as a pandas dataframe
    phase_extend = extended_haplotype
    del extended_haplotype

    return pd.read_table(StringIO(phase_extend), sep='\t')
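# In this function, accumulate(grp) collapses one itertools.groupby group (an iterator of
# per-row dicts sharing the same PI key) into a single dict of column -> list of values, so
# that v1['POS'], v1[soi + ':PG_al'] etc. can be indexed per row. A hedged sketch of such a
# helper (the real one may differ):
import collections


def accumulate_group(rows):
    """Turn an iterable of dicts into an OrderedDict mapping each column to a list of values."""
    grouped = collections.OrderedDict()
    for row in rows:
        for column, value in row.items():
            grouped.setdefault(column, []).append(value)
    return grouped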