def make_crops(seq_file):
    # Split the FASTA file into its header line and sequence lines.
    target_line, *seq_lines = seq_file.read_text().split('\n')
    target = seq_file.stem
    suffix = seq_file.suffix
    target_seq = ''.join(seq_lines)
    # Write one FASTA file per domain crop, skipping the full-length target.
    for domain in utils.generate_domains(target, target_seq):
        name = domain['name']
        if name == target:
            continue
        # Domain boundaries are 1-based and inclusive.
        crop_start, crop_end = domain['description']
        seq = target_seq[crop_start - 1:crop_end]
        (seq_file.parent / f'{name}{suffix}').write_text(f'>{name}\n{seq}')
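Both make_crops and the pasting logic in ensemble below rely on utils.generate_domains returning one entry per crop, each carrying a name and a 1-based inclusive (start, end) span, plus an entry for the full-length target itself. A minimal sketch of that contract, assuming made-up crop names, sizes, and strides (the real ones live in utils and may differ):

```python
# Hypothetical sketch of the utils.generate_domains contract assumed above.
# Each entry has a 'name' and a 1-based inclusive (start, end) span; the
# full-length target appears under its own name and is skipped when cropping.
def generate_domains_sketch(target, seq, crop_size=64, stride=32):
    domains = [{'name': target, 'description': (1, len(seq))}]
    for start in range(1, len(seq) - crop_size + 2, stride):
        domains.append({'name': f'{target}-{start}_{start + crop_size - 1}',
                        'description': (start, start + crop_size - 1)})
    return domains
```

Note that the crop names keep the target name before the first '-', which is what ensemble's `t.split('-')[0]` relies on to group crops back under their target.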
def ensemble(target_path, out_dir):
    # Average the distograms predicted by each numbered replica of every model.
    for model_dir in filter(lambda d: d.is_dir() and d.name != 'pasted',
                            out_dir.iterdir()):
        r = {}
        for replica_dir in filter(lambda d: d.is_dir() and d.name.isdigit(),
                                  model_dir.iterdir()):
            for pkl in replica_dir.glob('*.distance'):
                target = pkl.name.split('.')[0]
                dis = np.load(pkl, allow_pickle=True)
                r.setdefault(target, []).append(dis)
        ensemble_dir = model_dir / 'ensemble'
        ensemble_dir.mkdir(exist_ok=True)
        for k, v in r.items():
            ensemble_file = ensemble_dir / f'{k}.distance'
            ensemble_dis = sum(v) / len(v)
            ensemble_dis.dump(ensemble_file)

    # Weight each domain by its alignment depth when pasting crops together.
    targets_weight = {
        data['domain_name']: {
            'weight': data['num_alignments'][0, 0],
            'seq': data['sequence']
        }
        for data in np.load(target_path, allow_pickle=True)
    }

    # Paste the ensembled Distogram crops back into full-length maps.
    ensemble_dir = out_dir / 'Distogram' / 'ensemble'
    paste_dir = out_dir / 'pasted'
    paste_dir.mkdir(exist_ok=True)
    targets = {t.split('-')[0] for t in targets_weight}
    for target in targets:
        combined_cmap = np.load(ensemble_dir / f'{target}.distance',
                                allow_pickle=True)
        counter_map = np.ones_like(combined_cmap[:, :, 0:1])
        seq = targets_weight[target]['seq']
        target_domains = utils.generate_domains(target, seq)
        for domain in sorted(target_domains, key=lambda x: x['name']):
            if domain['name'] == target:
                continue
            crop_start, crop_end = domain['description']
            domain_dis = np.load(ensemble_dir / f'{domain["name"]}.distance',
                                 allow_pickle=True)
            weight = targets_weight[domain['name']]['weight']
            weight_matrix_size = crop_end - crop_start + 1
            weight_matrix = np.ones((weight_matrix_size, weight_matrix_size),
                                    dtype=np.float32) * weight
            combined_cmap[crop_start - 1:crop_end,
                          crop_start - 1:crop_end, :] += (
                domain_dis * np.expand_dims(weight_matrix, 2))
            counter_map[crop_start - 1:crop_end,
                        crop_start - 1:crop_end, 0] += weight_matrix
        # Normalise the accumulated distogram by the total weight per pair.
        combined_cmap /= counter_map
        combined_cmap.dump(paste_dir / f'{target}.distance')
        # Contact probability: the summed mass of the 19 shortest-distance bins.
        contact_probs = combined_cmap[:, :, :19].sum(-1)
        utils.save_rr_file(contact_probs, seq, target,
                           paste_dir / f'{target}.rr')
        utils.plot_contact_map(target, [contact_probs, combined_cmap],
                               paste_dir / f'{target}.png')
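The pasting step is a per-residue-pair weighted average: each crop's distogram is added into the full-length map scaled by its alignment-depth weight, while a counter map accumulates the weights used for the final normalisation (the full-length prediction itself enters with an implicit weight of 1). A self-contained sketch of the same arithmetic on toy arrays, with made-up shapes and weights:

```python
import numpy as np

# Toy weighted paste: a full-length 5-residue map with 3 distogram bins,
# and one crop spanning 1-based positions 2..4 with weight 4.0.
L, bins = 5, 3
combined = np.zeros((L, L, bins), dtype=np.float32)   # full-length prediction
counter = np.ones((L, L, 1), dtype=np.float32)        # its implicit weight of 1
crop = np.full((3, 3, bins), 0.5, dtype=np.float32)   # crop prediction
start, end, weight = 2, 4, 4.0

combined[start - 1:end, start - 1:end, :] += crop * weight
counter[start - 1:end, start - 1:end, 0] += weight
combined /= counter  # broadcasts the (L, L, 1) weights over all bins

# Inside the crop: (0 * 1 + 0.5 * 4) / (1 + 4) = 0.4; outside it stays 0.
assert np.isclose(combined[2, 2, 0], 0.4)
assert np.isclose(combined[0, 0, 0], 0.0)
```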
def feature_generation(seq_file, out_file):
    target_line, *seq_lines = seq_file.read_text().split('\n')
    target = seq_file.stem
    target_seq = ''.join(seq_lines)
    data_dir = seq_file.parent
    dataset = []
    for domain in utils.generate_domains(target, target_seq):
        name = domain['name']
        crop_start, crop_end = domain['description']
        seq = target_seq[crop_start - 1:crop_end]
        L = len(seq)
        hhm_file = data_dir / f'{name}.hhm'
        fas_file = data_dir / f'{name}.fas'
        aln_file = data_dir / f'{name}.aln'
        mat_file = data_dir / f'{name}.mat'

        if aln_file.exists():
            aln, _ = read_aln(aln_file)
        else:
            # Build the .aln from the .fas by dropping columns that are gaps
            # in the query, then stop so external alignment-based steps
            # (e.g. plmDCA) can be run before feature generation is retried.
            aln, aln_id = read_aln(fas_file)
            aln = aln[:, aln[0] != '-']
            write_aln(aln, aln_id, aln_file)
            exit()

        if mat_file.exists():
            # Precomputed plmDCA outputs (MATLAB .mat file).
            mat = sio.loadmat(mat_file)
            pseudo_bias = np.float32(mat['pseudo_bias'])
            pseudo_frob = np.float32(np.expand_dims(mat['pseudo_frob'], -1))
            pseudolikelihood = np.float32(mat['pseudolikelihood'])
        else:
            # Fall back to zero-filled plmDCA features.
            pseudo_bias = np.zeros((L, 22), dtype=np.float32)
            pseudo_frob = np.zeros((L, L, 1), dtype=np.float32)
            pseudolikelihood = np.zeros((L, L, 484), dtype=np.float32)

        # Pairwise gap co-occurrence frequencies across the alignment.
        gap_count = np.float32(aln == '-')
        gap_matrix = np.expand_dims(
            np.matmul(gap_count.T, gap_count) / aln.shape[0], -1)

        # Per-position profiles over the 22-letter alphabet (20 amino acids,
        # X, and gap): raw counts and sequence-weight reweighted counts.
        mapping = {aa: i for i, aa in enumerate('ARNDCQEGHILKMFPSTWYVX-')}
        seq_weight = sequence_weights(aln)
        hhblits_profile = np.zeros((L, 22), dtype=np.float32)
        reweighted_profile = np.zeros((L, 22), dtype=np.float32)
        for i in range(L):
            for j in range(aln.shape[0]):
                hhblits_profile[i, mapping[aln[j, i]]] += 1
                reweighted_profile[i, mapping[aln[j, i]]] += seq_weight[j]
        hhblits_profile /= hhblits_profile.sum(-1).reshape(-1, 1)
        reweighted_profile /= reweighted_profile.sum(-1).reshape(-1, 1)

        # Profile over the 21-letter alphabet with the gap column zeroed out;
        # unknown residues ('X') are counted as alanine because this mapping
        # has no X column.
        mapping = {aa: i for i, aa in enumerate('ARNDCQEGHILKMFPSTWYV-')}
        non_gapped_profile = np.zeros((L, 21), dtype=np.float32)
        for i in range(L):
            for j in aln[:, i]:
                if j == 'X':
                    j = 'A'
                non_gapped_profile[i, mapping[j]] += 1
        non_gapped_profile[:, -1] = 0
        non_gapped_profile /= non_gapped_profile.sum(-1).reshape(-1, 1)

        # Column frequencies, pair frequencies, effective alignment depth,
        # and mutual information.
        mapping = {aa: i for i, aa in enumerate('-ARNDCQEGHILKMFPSTWYVX')}
        a2n = np.frompyfunc(lambda x: mapping[x], 1, 1)
        fi, fij, Meff = calculate_f(a2n(aln))
        MI = calculate_MI(fi, fij)

        data = {
            'chain_name': target,
            'domain_name': name,
            'sequence': seq,
            'seq_length': np.ones((L, 1), dtype=np.int64) * L,
            'residue_index': np.arange(L, dtype=np.int64).reshape(L, 1),
            'aatype': sequence_to_onehot(seq),
            'hhblits_profile': hhblits_profile,
            'reweighted_profile': reweighted_profile,
            'hmm_profile': extract_hmm_profile(hhm_file.read_text(), seq),
            'num_alignments': np.ones((L, 1), dtype=np.int64) * aln.shape[0],
            'deletion_probability':
                np.float32(aln == '-').mean(0).reshape(-1, 1),
            'gap_matrix': gap_matrix,
            'non_gapped_profile': non_gapped_profile,
            # plmDCA
            'pseudo_frob': pseudo_frob,
            'pseudo_bias': pseudo_bias,
            'pseudolikelihood': pseudolikelihood,
            'num_effective_alignments': np.float32(Meff),
            'mutual_information': MI,
            # Features not needed for prediction.
            'resolution': np.float32(0),
            'sec_structure': np.zeros((L, 8), dtype=np.int64),
            'sec_structure_mask': np.zeros((L, 1), dtype=np.int64),
            'solv_surf': np.zeros((L, 1), dtype=np.float32),
            'solv_surf_mask': np.zeros((L, 1), dtype=np.int64),
            'alpha_positions': np.zeros((L, 3), dtype=np.float32),
            'alpha_mask': np.zeros((L, 1), dtype=np.int64),
            'beta_positions': np.zeros((L, 3), dtype=np.float32),
            'beta_mask': np.zeros((L, 1), dtype=np.int64),
            'superfamily': '',
            'between_segment_residues': np.zeros((L, 1), dtype=np.int64),
            'phi_angles': np.zeros((L, 1), dtype=np.float32),
            'phi_mask': np.zeros((L, 1), dtype=np.int64),
            'psi_angles': np.zeros((L, 1), dtype=np.float32),
            'psi_mask': np.zeros((L, 1), dtype=np.int64),
            # To be fixed soon.
            # profile: a profile (probability distribution over amino acid
            # types) computed using PSI-BLAST; equivalent to the output of
            # ChkParse.
            'profile': np.zeros((L, 21), dtype=np.float32),
            'profile_with_prior': np.zeros((L, 22), dtype=np.float32),
            'profile_with_prior_without_gaps':
                np.zeros((L, 21), dtype=np.float32)
        }
        dataset.append(data)
    np.save(out_file, dataset, allow_pickle=True)
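Taken together, the three functions form a crop, featurise, then ensemble-and-paste pipeline. A minimal driver sketch under stated assumptions: the paths, the features file name, and the inference step that writes out/<model>/<replica>/<domain>.distance files are all placeholders, not part of this file.

```python
from pathlib import Path

# Hypothetical target layout; adjust paths to the actual directory structure.
seq_file = Path('T1000/T1000.fasta')
features = seq_file.parent / 'T1000.features.npy'
out_dir = Path('T1000/out')

# 1. Write one FASTA per domain crop next to the target FASTA.
make_crops(seq_file)

# 2. Generate input features; expects per-domain .hhm/.fas (and optional
#    .aln/.mat) files alongside the FASTA, and stops early if an .aln has
#    to be created first.
feature_generation(seq_file, features)

# 3. Run the distogram models here, so that each model writes
#    out/<model>/<replica>/<domain>.distance files (not shown).

# 4. Average the replicas per model, then paste the Distogram crops into
#    full-length maps under out/pasted/.
ensemble(features, out_dir)
```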