Ejemplo n.º 1
0
def make_crops(seq_file):
    """Write one sequence file per domain crop next to ``seq_file``.

    The first line of ``seq_file`` is treated as a header and ignored; the
    remaining lines are concatenated into the full sequence.  For every
    domain returned by ``utils.generate_domains`` whose name differs from
    the file stem, the ``(start, end)`` pair stored under ``"description"``
    is used as a 1-based inclusive range and the matching sub-sequence is
    written to ``<name><suffix>`` in the same directory, as a ``>name``
    header line followed by the cropped sequence.

    Args:
        seq_file: ``pathlib.Path``-like object pointing at the sequence file.
    """
    lines = seq_file.read_text().split('\n')
    chain = seq_file.stem
    ext = seq_file.suffix
    full_seq = ''.join(lines[1:])
    out_dir = seq_file.parent

    for dom in utils.generate_domains(chain, full_seq):
        dom_name = dom['name']
        # The entry named after the chain itself is the full sequence,
        # which already exists as ``seq_file``.
        if dom_name != chain:
            start, end = dom["description"]
            fragment = full_seq[start - 1:end]
            crop_path = out_dir / f'{dom_name}{ext}'
            crop_path.write_text(f'>{dom_name}\n{fragment}')
def ensemble(target_path, out_dir):
    """Average replica predictions per model, then paste domain crops together.

    Stage 1: for every sub-directory of ``out_dir`` other than ``pasted``,
    the ``*.distance`` files found in its digit-named replica directories
    are grouped by the part of the file name before the first ``.`` and
    averaged into ``<model_dir>/ensemble/<key>.distance``.

    Stage 2: the averaged arrays under ``out_dir/Distogram/ensemble`` are
    merged per chain.  Each domain crop is multiplied by its weight (the
    ``num_alignments[0, 0]`` value of its record) and added onto the
    full-length array, which is then divided by the accumulated weights
    (the full-length prediction contributes weight 1).  The merged array,
    an ``.rr`` file and a ``.png`` plot are written to ``out_dir/pasted``.

    Args:
        target_path: File loaded with ``np.load(..., allow_pickle=True)``
            and iterated as records providing ``domain_name``,
            ``num_alignments`` and ``sequence``.
        out_dir: ``pathlib.Path``-like directory holding the per-model
            prediction folders.
    """
    # ---- Stage 1: per-model mean over replicas -------------------------
    for model_dir in out_dir.iterdir():
        if not model_dir.is_dir() or model_dir.name == 'pasted':
            continue

        replicas_by_key = {}
        for replica_dir in model_dir.iterdir():
            if not (replica_dir.is_dir() and replica_dir.name.isdigit()):
                continue
            for dist_file in replica_dir.glob('*.distance'):
                key = dist_file.name.split('.')[0]
                prediction = np.load(dist_file, allow_pickle=True)
                replicas_by_key.setdefault(key, []).append(prediction)

        averaged_dir = model_dir / 'ensemble'
        averaged_dir.mkdir(exist_ok=True)
        for key, predictions in replicas_by_key.items():
            mean_prediction = sum(predictions) / len(predictions)
            mean_prediction.dump(averaged_dir / f'{key}.distance')

    # ---- Stage 2: paste weighted domain crops onto the full chain ------
    domain_info = {}
    for record in np.load(target_path, allow_pickle=True):
        domain_info[record['domain_name']] = {
            'weight': record['num_alignments'][0, 0],
            'seq': record['sequence'],
        }

    distogram_dir = out_dir / 'Distogram' / 'ensemble'
    pasted_dir = out_dir / 'pasted'
    pasted_dir.mkdir(exist_ok=True)
    # Chain name is the part of a domain name before the first "-".
    chains = {dom_name.split("-")[0] for dom_name in domain_info}

    for chain in chains:
        pasted = np.load(distogram_dir / f'{chain}.distance',
                         allow_pickle=True)
        # Start the normaliser at 1 for the full-length prediction.
        weight_sum = np.ones_like(pasted[:, :, 0:1])
        chain_seq = domain_info[chain]['seq']
        domains = utils.generate_domains(chain, chain_seq)

        for dom in sorted(domains, key=lambda d: d["name"]):
            dom_name = dom["name"]
            if dom_name == chain:
                continue

            start, end = dom["description"]
            crop = np.load(distogram_dir / f'{dom_name}.distance',
                           allow_pickle=True)
            dom_weight = domain_info[dom_name]['weight']
            side = end - start + 1
            weight_block = np.ones((side, side),
                                   dtype=np.float32) * dom_weight
            # (start, end) is used as a 1-based inclusive residue range.
            window = slice(start - 1, end)
            pasted[window, window, :] += (crop *
                                          np.expand_dims(weight_block, 2))
            weight_sum[window, window, 0] += weight_block

        pasted /= weight_sum
        pasted.dump(pasted_dir / f'{chain}.distance')
        # Contact probability = sum over the first 19 channels.
        contacts = pasted[:, :, :19].sum(-1)
        utils.save_rr_file(contacts, chain_seq, chain,
                           pasted_dir / f'{chain}.rr')
        utils.plot_contact_map(chain, [contacts, pasted],
                               pasted_dir / f'{chain}.png')
Ejemplo n.º 3
0
def feature_generation(seq_file, out_file):
    """Build the per-domain feature dictionaries for one sequence file.

    The first line of ``seq_file`` is treated as a header and ignored; the
    remaining lines are concatenated into the full sequence.  For every
    domain returned by ``utils.generate_domains`` the function reads the
    companion files that live next to ``seq_file``:

    * ``<name>.aln`` - alignment; if missing it is derived from
      ``<name>.fas`` and written back as ``<name>.aln``.
    * ``<name>.mat`` - optional plmDCA output (``pseudo_bias``,
      ``pseudo_frob``, ``pseudolikelihood``); zeros are used when absent.
    * ``<name>.hhm`` - required; passed to ``extract_hmm_profile``.

    One feature dict per domain is collected and the list is stored with
    ``np.save(out_file, ..., allow_pickle=True)``.

    Unlike earlier revisions, the function no longer calls ``exit()`` after
    creating a missing ``.aln`` file: that terminated the interpreter after
    the first converted domain and ``out_file`` was never written.

    Args:
        seq_file: ``pathlib.Path``-like object pointing at the sequence file.
        out_file: Destination handed to ``np.save``.
    """
    _header, *seq_lines = seq_file.read_text().split('\n')
    target = seq_file.stem
    target_seq = ''.join(seq_lines)
    data_dir = seq_file.parent
    dataset = []

    for domain in utils.generate_domains(target, target_seq):
        name = domain['name']
        # (crop_start, crop_end) is used as a 1-based inclusive range.
        crop_start, crop_end = domain["description"]
        seq = target_seq[crop_start - 1:crop_end]
        L = len(seq)
        hhm_file = data_dir / f'{name}.hhm'
        fas_file = data_dir / f'{name}.fas'
        aln_file = data_dir / f'{name}.aln'
        mat_file = data_dir / f'{name}.mat'

        if aln_file.exists():
            aln, _ = read_aln(aln_file)
        else:
            # Build the alignment from the .fas file, keeping only the
            # columns where the first row is not a gap, and cache it as
            # .aln for later runs.  Processing then continues with the
            # freshly built alignment instead of terminating the process.
            aln, aln_id = read_aln(fas_file)
            aln = aln[:, aln[0] != '-']
            write_aln(aln, aln_id, aln_file)

        if mat_file.exists():
            mat = sio.loadmat(mat_file)
            pseudo_bias = np.float32(mat['pseudo_bias'])
            pseudo_frob = np.float32(np.expand_dims(mat['pseudo_frob'], -1))
            pseudolikelihood = np.float32(mat['pseudolikelihood'])
        else:
            # No plmDCA output available: fall back to all-zero features.
            pseudo_bias = np.zeros((L, 22), dtype=np.float32)
            pseudo_frob = np.zeros((L, L, 1), dtype=np.float32)
            pseudolikelihood = np.zeros((L, L, 484), dtype=np.float32)

        # 1.0 where an alignment cell is a gap, 0.0 elsewhere.
        gap_count = np.float32(aln == '-')
        # Fraction of sequences that have a gap in both columns i and j.
        gap_matrix = np.expand_dims(
            np.matmul(gap_count.T, gap_count) / aln.shape[0], -1)

        # 22-letter alphabet (20 amino acids + 'X' + gap) for the
        # count-based profiles.
        mapping = {aa: i for i, aa in enumerate('ARNDCQEGHILKMFPSTWYVX-')}
        seq_weight = sequence_weights(aln)
        hhblits_profile = np.zeros((L, 22), dtype=np.float32)
        reweighted_profile = np.zeros((L, 22), dtype=np.float32)
        for i in range(L):
            for j in range(aln.shape[0]):
                hhblits_profile[i, mapping[aln[j, i]]] += 1
                reweighted_profile[i, mapping[aln[j, i]]] += seq_weight[j]
        # Normalise each position to a distribution over the alphabet.
        hhblits_profile /= hhblits_profile.sum(-1).reshape(-1, 1)
        reweighted_profile /= reweighted_profile.sum(-1).reshape(-1, 1)

        # 21-letter alphabet without 'X' for the non-gapped profile.
        mapping = {aa: i for i, aa in enumerate('ARNDCQEGHILKMFPSTWYV-')}
        non_gapped_profile = np.zeros((L, 21), dtype=np.float32)
        for i in range(L):
            for j in aln[:, i]:
                # This alphabet has no 'X' entry, so 'X' is counted as 'A'.
                if j == 'X':
                    j = 'A'
                non_gapped_profile[i, mapping[j]] += 1
        # Drop the gap counts before normalising.
        non_gapped_profile[:, -1] = 0
        non_gapped_profile /= non_gapped_profile.sum(-1).reshape(-1, 1)

        # Integer encoding (gap = 0) used by calculate_f.
        mapping = {aa: i for i, aa in enumerate('-ARNDCQEGHILKMFPSTWYVX')}

        a2n = np.frompyfunc(lambda x: mapping[x], 1, 1)
        fi, fij, Meff = calculate_f(a2n(aln))
        MI = calculate_MI(fi, fij)

        data = {
            'chain_name': target,
            'domain_name': name,
            'sequence': seq,
            'seq_length': np.ones((L, 1), dtype=np.int64) * L,
            'residue_index': np.arange(L, dtype=np.int64).reshape(L, 1),
            'aatype': sequence_to_onehot(seq),
            # Per-position residue frequencies counted from the alignment:
            # unweighted, and weighted by sequence_weights().
            'hhblits_profile': hhblits_profile,
            'reweighted_profile': reweighted_profile,
            'hmm_profile': extract_hmm_profile(hhm_file.read_text(), seq),
            'num_alignments': np.ones((L, 1), dtype=np.int64) * aln.shape[0],
            # Per-column fraction of gapped sequences.
            'deletion_probability': gap_count.mean(0).reshape(-1, 1),
            'gap_matrix': gap_matrix,
            'non_gapped_profile': non_gapped_profile,
            # plmDCA
            'pseudo_frob': pseudo_frob,
            'pseudo_bias': pseudo_bias,
            'pseudolikelihood': pseudolikelihood,
            'num_effective_alignments': np.float32(Meff),
            'mutual_information': MI,
            # Placeholders: features not needed for prediction.
            'resolution': np.float32(0),
            'sec_structure': np.zeros((L, 8), dtype=np.int64),
            'sec_structure_mask': np.zeros((L, 1), dtype=np.int64),
            'solv_surf': np.zeros((L, 1), dtype=np.float32),
            'solv_surf_mask': np.zeros((L, 1), dtype=np.int64),
            'alpha_positions': np.zeros((L, 3), dtype=np.float32),
            'alpha_mask': np.zeros((L, 1), dtype=np.int64),
            'beta_positions': np.zeros((L, 3), dtype=np.float32),
            'beta_mask': np.zeros((L, 1), dtype=np.int64),
            'superfamily': '',
            'between_segment_residues': np.zeros((L, 1), dtype=np.int64),
            'phi_angles': np.zeros((L, 1), dtype=np.float32),
            'phi_mask': np.zeros((L, 1), dtype=np.int64),
            'psi_angles': np.zeros((L, 1), dtype=np.float32),
            'psi_mask': np.zeros((L, 1), dtype=np.int64),
            # to be fixed soon
            'profile': np.zeros((L, 21), dtype=np.float32),
            'profile_with_prior': np.zeros((L, 22), dtype=np.float32),
            'profile_with_prior_without_gaps': np.zeros((L, 21),
                                                        dtype=np.float32)
        }
        dataset.append(data)

    np.save(out_file, dataset, allow_pickle=True)