예제 #1
0
    def intialise_potentials(self):
        """Initialise the single and pair potentials of the model.

        If ``self.init_raw_file`` is set, the potentials are read from that
        binary raw file with ``raw.parse_msgpack``.  If in addition
        ``self.gapped_positions`` is set, the entries belonging to those
        positions are removed from both potentials.

        Otherwise the single potentials are taken from
        ``self.regularization.center_x_single`` and the pair potentials are
        initialised with zeros of shape ``(self.L, self.L, 21, 21)``.

        Sets ``self.x_single``, ``self.x_pair``, ``self.single_potential_init``
        and ``self.pair_potential_init`` (the latter two are kept as meta
        data describing how the potentials were initialised).

        Exits the process with status 1 if the raw file cannot be parsed.
        """
        if self.init_raw_file is None:
            # default initialisation of parameters:
            # single potentials come from the regularization prior
            self.x_single = self.regularization.center_x_single

            # pair potentials start at zero
            self.x_pair = np.zeros((self.L, self.L, 21, 21))

            # save setting for meta data
            self.single_potential_init = self.reg_type
            self.pair_potential_init = "zero"
            return

        try:
            raw_potentials = raw.parse_msgpack(self.init_raw_file)
        except Exception as err:
            # Catch Exception (not a bare except) so Ctrl-C and SystemExit
            # still propagate, and report failure with a non-zero status.
            print("Unexpected error while reading binary raw file {0}: {1}: {2}".
                  format(self.init_raw_file, type(err).__name__, err))
            sys.exit(1)

        print("\nSuccessfully loaded model parameters from {0}.".format(
            self.init_raw_file))
        self.x_single, self.x_pair = raw_potentials.x_single, raw_potentials.x_pair

        # in case positions with many gaps should be removed
        if self.gapped_positions is not None:
            # set lookup avoids rescanning gapped_positions for every column
            gapped = set(self.gapped_positions)
            indices = [
                i for i in range(raw_potentials.ncol) if i not in gapped
            ]
            self.x_single = self.x_single[indices, :]
            # drop the gapped positions along both position axes
            self.x_pair = self.x_pair[indices, :, :, :]
            self.x_pair = self.x_pair[:, indices, :, :]
            print(
                "Removed parameters for positions with >{0}% gaps.".format(
                    self.max_gap_pos))

        # save setting for meta data
        self.single_potential_init = self.init_raw_file
        self.pair_potential_init = self.init_raw_file
def main():
    """Command line entry point: plot a contact map from a mat or braw file.

    Exactly one of ``--mat-file`` / ``--braw-file`` must be given (enforced
    by argparse), together with the output directory ``--plot-out``.  An
    optional alignment file is read to compute sequence weights and
    frequencies, which are needed for the entropy correction and are passed
    on to the plotting routine.  The resulting score matrix is optionally
    corrected (entropy correction for braw input, or APC) and handed to
    ``plot_contact_map``.
    """
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat-file', dest='mat_file', type=str, help='path to mat file')
    group_append.add_argument('-b', '--braw-file', dest='braw_file', type=str,help='path to binary raw coupling file')

    # The output directory is concatenated into the plot path below, so a
    # missing value has to be rejected up front instead of failing with a
    # TypeError after all the computation is done.
    parser.add_argument('-o', '--plot-out', dest='plot_out', type=str, required=True, help='Output directory for plot')

    parser.add_argument('--seq-sep', dest='seqsep', type=int, default=6, help='Minimal sequence separation')
    parser.add_argument('--contact-threshold', dest='contact_threshold', type=int, default=8,  help='Contact definition as maximal C_beta distance between residue pairs.')
    parser.add_argument('--pdb-file', dest='pdb_file', type=str, help='Optional PDB file (renumbered starting from 1) for distance matrix.')
    parser.add_argument('--alignment-file', dest='alignment_file', type=str, help='Optional alignment file for gap percentage and entropy subplot.')
    parser.add_argument("--aln-format", dest="aln_format", default="psicov", help="File format for MSAs [default: \"%(default)s\"]")
    parser.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    parser.add_argument("--entropy-correction", dest='entropy_correction', action="store_true", default=False, help="Apply entropy correction")

    args = parser.parse_args()

    # NOTE: no manual "mat_file or braw_file" check is needed here; the
    # required mutually exclusive group guarantees exactly one is set.
    mat_file = args.mat_file
    braw_file = args.braw_file
    alignment_file = args.alignment_file
    aln_format = args.aln_format
    pdb_file = args.pdb_file
    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold

    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment = None
    if alignment_file is not None:
        alignment = read_msa(alignment_file, aln_format)

        # compute sequence weights
        weighting = SequenceWeights(False, 0.8)
        weights = weighting.weights_simple(alignment)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

    if braw_file is not None:

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io.frobenius_score(braw.x_pair)

        if entropy_correction:
            if alignment is None:
                # the uncorrected frobenius score is plotted in this case
                print("Alignment file is necessary to compute entropy correction!")
            else:
                _, mat = io.compute_local_correction(
                    pseudocounts.freqs[0],
                    braw.x_pair,
                    meta_info['workflow'][0]['msafile']['neff'],
                    meta_info['workflow'][0]['regularization']['lambda_pair'],
                    mat,
                    squared=False, entropy=True
                )
        elif apc:
            mat = io.apc(mat)

    if mat_file is not None:
        mat, _ = io.read_matrix(mat_file)

        if entropy_correction:
            # couplings are not available from a mat file; plot it as read
            print("Binary Raw file is necessary to compute entropy correction!")
        elif apc:
            mat = io.apc(mat)

    plot_file = plot_out + "/contact_map_seqsep{0}_contacthr{1}.html".format(seqsep, contact_threshold)
    plot_contact_map(mat, seqsep, contact_threshold, plot_file, "", alignment=alignment, pdb_file=pdb_file)
예제 #3
0
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Build the contact-map data frame for one protein and plot it.

    The score matrix is taken from ``braw_file`` (frobenius score of the
    couplings, optionally entropy- or APC-corrected) and/or ``mat_file``
    (optionally APC-corrected); if both are given the matrix read from
    ``mat_file`` is the one that is plotted.  When ``alignment_file`` is
    given, sequence weights and frequencies are computed from it and a
    gap-percentage subplot is created.  When ``pdb_file`` is given, the
    observed distances and the derived contact labels are added.

    Parameters:
        alignment_file: path to the alignment, or None.
        aln_format: alignment format passed to ``io.read_msa``.
        braw_file: path to the binary raw coupling file, or None.
        mat_file: path to the matrix file, or None.
        pdb_file: path to the PDB file, or None.
        plot_file: output path handed to the plotting routine.
        entropy_correction: apply entropy correction (needs alignment and
            braw file).
        apc: apply average product correction (ignored for braw input when
            entropy correction is requested).
        seqsep: minimal sequence separation; used as diagonal offset of the
            upper triangle that is plotted.
        contact_threshold: distance below which a pair counts as contact.

    Exits the process with status 1 when the combination of inputs is
    invalid (entropy correction without alignment/braw file, or neither a
    braw file nor a mat file).
    """
    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None

    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw couplign file!")
        sys.exit(1)

    # Without either input there is no score matrix; fail with a clear
    # message instead of a TypeError from len(None) further down.
    if braw_file is None and mat_file is None:
        print("Either braw_file or mat_file need to be set.")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:
        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:
            # pseudocounts is guaranteed to be set here: entropy correction
            # without an alignment file was rejected above
            _, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:
        protein = os.path.basename(mat_file).split(".")[0]

        mat, _ = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    # residue pairs i < j that are at least seqsep positions apart
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    plot_matrix = pd.DataFrame()
    # residues are reported 1-based
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        # 1 for pairs closer than the threshold, 0 otherwise
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    plot_title = "Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)
def collect_data(braw_dir, psicov_dir, pdb_dir, sequence_separation, cb_lower, cb_upper, nr_residue_pairs,
             diversity_threshold, Nij_threshold, l2normapc_threshold):
    """Collect coupling values of selected residue pairs over many proteins.

    Every ``*braw.gz`` file in ``braw_dir`` is processed as long as the
    matching ``<protein>.aln`` (in ``psicov_dir``) and ``<protein>.pdb``
    (in ``pdb_dir``) exist.  For each protein, residue pairs are kept that

    * are at least ``sequence_separation`` apart,
    * have a distance strictly between ``cb_lower`` and ``cb_upper``
      (positions with more than 50% gaps are masked out),
    * have pairwise counts above ``Nij_threshold`` and
    * have an APC-corrected l2norm score above ``l2normapc_threshold``.

    Proteins whose diversity ``sqrt(N) / L`` is below
    ``diversity_threshold`` are skipped.  Collection stops once more than
    ``nr_residue_pairs`` rows have been gathered.

    Returns:
        pandas.DataFrame with one row per residue pair and the 400
        coupling values (20 x 20) as columns named by ``io.AB``; an empty
        DataFrame if nothing was collected.
    """
    braw_files = glob.glob(braw_dir + "/*braw.gz")

    # Per-protein frames are gathered in a list and concatenated once at the
    # end: DataFrame.append was removed in pandas 2.0 and copied the whole
    # data set on every iteration.
    protein_frames = []
    n_collected = 0

    for braw_file in braw_files:
        protein = os.path.basename(braw_file).split(".")[0]
        print(protein)

        # os.path.join makes the directories work with or without a
        # trailing path separator
        alignment_file = os.path.join(psicov_dir, protein + '.aln')
        if not os.path.exists(alignment_file):
            print("Alignment File {0} does not exist.".format(alignment_file))
            continue

        pdb_file = os.path.join(pdb_dir, protein + '.pdb')
        if not os.path.exists(pdb_file):
            print("PDB File {0} does not exist.".format(pdb_file))
            continue

        # NOTE(review): meaning of the two trailing 8s is defined by
        # AlignmentFeatures and not visible here
        AF = AlignmentFeatures(alignment_file, sequence_separation, 8, 8)

        diversity = np.sqrt(AF.N) / AF.L
        if diversity < diversity_threshold:
            print("Diversity = {0}. Skip this protein.".format(diversity))
            continue

        braw = raw.parse_msgpack(braw_file)
        distance_map = pdb.distance_map(pdb_file, AF.L)

        # mask highly gapped positions; NaN distances fail both range
        # comparisons below, so these positions are never selected
        gaps = 1 - (AF.Ni / AF.neff)
        highly_gapped_pos = np.where(np.array(gaps) > 0.5)[0]
        distance_map[highly_gapped_pos, :] = np.nan
        distance_map[:, highly_gapped_pos] = np.nan

        # get all residue pairs i<j
        residue_i, residue_j = np.triu_indices(AF.L, k=sequence_separation)

        # get residue pairs within Cb range
        dist_ij_pairs = distance_map[residue_i, residue_j]
        in_cb_range = (dist_ij_pairs > cb_lower) & (dist_ij_pairs < cb_upper)
        residue_i = residue_i[in_cb_range]
        residue_j = residue_j[in_cb_range]

        if len(residue_i) == 0:
            print("No residues left after applying distance constraints.")
            continue

        # apply Nij_threshold
        enough_counts = AF.Nij[residue_i, residue_j] > Nij_threshold
        residue_i = residue_i[enough_counts]
        residue_j = residue_j[enough_counts]

        if len(residue_i) == 0:
            print("No residues left after applying pairwise counts constraints.")
            continue

        # compute l2norm_apc score that has mean=0
        l2norm_apc = bu.compute_l2norm_from_braw(braw.x_pair, apc=True)
        strong_score = l2norm_apc[residue_i, residue_j] > l2normapc_threshold
        residue_i = residue_i[strong_score]
        residue_j = residue_j[strong_score]

        if len(residue_i) == 0:
            print("No residues left after applying APC threshold constraints.")
            continue

        # flatten the 20 x 20 couplings of every pair into one row
        protein_coupling_df = pd.DataFrame(
            braw.x_pair[residue_i, residue_j, :20, :20].reshape(len(residue_i), 400),
            columns=io.AB)

        # -----------------------------------------------------------------------------------
        # for reproducibility: set all values between -0.005 and 0.01  to zero
        # ind = (protein_coupling_df.loc[:, :] > -0.005) & (protein_coupling_df.loc[:, :] < 0.01)
        # protein_coupling_df[ind] = 0
        # -----------------------------------------------------------------------------------
        protein_frames.append(protein_coupling_df)
        n_collected += len(protein_coupling_df)

        print("Dataset size: " + str(n_collected))
        sys.stdout.flush()
        if n_collected > nr_residue_pairs:
            break

    if protein_frames:
        # ignore_index gives the same 0..n-1 index as reset_index(drop=True)
        coupling_data = pd.concat(protein_frames, ignore_index=True)
    else:
        # pd.concat rejects an empty list; keep returning an empty frame
        coupling_data = pd.DataFrame()

    print("final dataset size: " + str(len(coupling_data)))

    return coupling_data