Beispiel #1
0
def compute_correction_terms(alignment_file, binary_raw_file):
    """Return the APC matrix and the entropy-correction matrix for one protein.

    A ``CCMpred`` object is set up from the alignment and the binary raw
    coupling file, its potentials are recentered, and two correction
    matrices are derived from the Frobenius score of the pair couplings.

    Args:
        alignment_file: path handed to ``CCMpred.set_alignment_file``; it is
            read in "psicov" format.
        binary_raw_file: path handed to ``CCMpred.set_initraw_file``.

    Returns:
        Tuple ``(apc_mat, entropy_correction_mat)``:
        ``apc_mat`` is the outer product of the column means of the score
        matrix divided by its overall mean; ``entropy_correction_mat`` is the
        score matrix minus the matrix returned by
        ``contactmatrix.compute_local_correction``.
    """
    predictor = CCMpred()

    # Register the input files before anything is read.
    predictor.set_alignment_file(alignment_file)
    predictor.set_initraw_file(binary_raw_file)

    # Read the alignment, dropping gapped sequences and positions
    # (50 / 75 are presumably the gap cutoffs -- TODO confirm against CCMpred).
    predictor.read_alignment("psicov", 50, 75)

    # Sequence weights reduce sampling bias.
    predictor.compute_sequence_weights("simple", 0.8)

    # Amino acid counts/frequencies, with pseudo counts for unobserved amino acids.
    predictor.compute_frequencies("uniform_pseudocounts", 1, 1)

    # Load the couplings from the binary raw file.
    predictor.intialise_potentials()

    # --- average product correction -------------------------------------
    predictor.recenter_potentials()
    score_mat = contactmatrix.frobenius_score(predictor.x_pair)
    column_means = np.mean(score_mat, axis=0)
    outer_means = column_means[:, np.newaxis] * column_means[np.newaxis, :]
    apc_mat = outer_means / np.mean(score_mat)

    # --- entropy correction ---------------------------------------------
    # Only the corrected matrix is needed; the scaling factor is discarded.
    _, corrected_mat = contactmatrix.compute_local_correction(
        predictor.pseudocounts.freqs[0],
        predictor.x_pair,
        predictor.neff,
        1,
        squared=False,
        entropy=True,
        nr_states=20,
        log=np.log2,
    )
    entropy_correction_mat = score_mat - corrected_mat

    return apc_mat, entropy_correction_mat
def main():
    """Command-line entry point: load or compute a contact score matrix and plot it.

    Exactly one of ``--mat-file`` / ``--braw-file`` must be given (enforced by
    argparse). With a binary raw coupling file the Frobenius score of the
    couplings is computed and, on request, entropy-corrected (needs
    ``--alignment-file``) or APC-corrected. With a mat file the matrix is read
    as is and can only be APC-corrected. The plot is written to
    ``<plot-out>/contact_map_seqsep<S>_contacthr<T>.html``.

    Exits via argparse (status 2) when a required option is missing.
    """
    parser = argparse.ArgumentParser(description='Plotting a contact map.')

    # argparse rejects the call unless exactly one of these is supplied, so no
    # manual "neither was set" check is needed after parsing.
    group_append = parser.add_mutually_exclusive_group(required=True)
    group_append.add_argument('-m', '--mat-file', dest='mat_file', type=str, help='path to mat file')
    group_append.add_argument('-b', '--braw-file', dest='braw_file', type=str, help='path to binary raw coupling file')

    # The output directory is always concatenated into the plot path below, so
    # it must be mandatory; otherwise the run would fail late with a TypeError.
    parser.add_argument('-o', '--plot-out', dest='plot_out', type=str, required=True,
                        help='Output directory for plot')

    parser.add_argument('--seq-sep', dest='seqsep', type=int, default=6, help='Minimal sequence separation')
    parser.add_argument('--contact-threshold', dest='contact_threshold', type=int, default=8,  help='Contact definition as maximal C_beta distance between residue pairs.')
    parser.add_argument('--pdb-file', dest='pdb_file', type=str, help='Optional PDB file (renumbered starting from 1) for distance matrix.')
    parser.add_argument('--alignment-file', dest='alignment_file', type=str, help='Optional alignment file for gap percentage and entropy subplot.')
    parser.add_argument("--aln-format", dest="aln_format", default="psicov", help="File format for MSAs [default: \"%(default)s\"]")
    parser.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    parser.add_argument("--entropy-correction", dest='entropy_correction', action="store_true", default=False, help="Apply entropy correction")

    args = parser.parse_args()

    mat_file = args.mat_file
    braw_file = args.braw_file
    alignment_file = args.alignment_file
    aln_format = args.aln_format
    pdb_file = args.pdb_file
    plot_out = args.plot_out
    seqsep = args.seqsep
    contact_threshold = args.contact_threshold

    apc = args.apc
    entropy_correction = args.entropy_correction

    alignment = None
    if alignment_file is not None:
        alignment = read_msa(alignment_file, aln_format)

        # compute sequence weights
        weighting = SequenceWeights(False, 0.8)
        weights = weighting.weights_simple(alignment)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

    if braw_file is not None:
        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io.frobenius_score(braw.x_pair)

        if entropy_correction:
            if alignment is None:
                # Best effort: warn and plot the uncorrected scores.
                print("Alignment file is necessary to compute entropy correction!")
            else:
                # `pseudocounts` is guaranteed to exist here because an
                # alignment was read above. The scaling factor is not used.
                _, mat = io.compute_local_correction(
                    pseudocounts.freqs[0],
                    braw.x_pair,
                    meta_info['workflow'][0]['msafile']['neff'],
                    meta_info['workflow'][0]['regularization']['lambda_pair'],
                    mat,
                    squared=False, entropy=True
                )
        elif apc:
            mat = io.apc(mat)
    else:
        # The required mutually exclusive group guarantees that a mat file
        # was given whenever no binary raw file was.
        mat, _ = io.read_matrix(mat_file)

        if entropy_correction:
            # Best effort: warn and plot the matrix as read.
            print("Binary Raw file is necessary to compute entropy correction!")
        elif apc:
            mat = io.apc(mat)

    plot_file = plot_out + "/contact_map_seqsep{0}_contacthr{1}.html".format(seqsep, contact_threshold)
    plot_contact_map(mat, seqsep, contact_threshold, plot_file, "", alignment=alignment, pdb_file=pdb_file)
Beispiel #3
0
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Build a contact score table from a coupling/mat file and plot it.

    Args:
        alignment_file: optional alignment path; when given, a gap-percentage
            plot is produced and its frequencies feed the entropy correction.
        aln_format: alignment format passed to ``io.read_msa``.
        braw_file: optional binary raw coupling file; its pair couplings are
            turned into a Frobenius score matrix.
        mat_file: optional mat file. If given together with ``braw_file`` the
            matrix read from it replaces the one computed from the couplings.
        pdb_file: optional PDB file; adds ``distance`` and ``contact`` columns.
        plot_file: output path forwarded to the plotting routine.
        entropy_correction: apply entropy correction (requires both
            ``alignment_file`` and ``braw_file``).
        apc: apply average product correction (ignored for the braw matrix
            when ``entropy_correction`` is set).
        seqsep: diagonal offset; only pairs with ``j - i >= seqsep`` are kept.
        contact_threshold: pairs with distance below this value count as contacts.

    Exits with status 1 (after printing a message) when entropy correction is
    requested without its inputs, or when no matrix source is given at all.
    """
    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None

    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw couplign file!")
        sys.exit(1)

    # Without any matrix source `mat` would stay None and the function would
    # fail further down with an opaque TypeError; stop early instead.
    if braw_file is None and mat_file is None:
        print("Either a binary raw coupling file or a mat file needs to be specified!")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:
        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:
            # The guard at the top guarantees an alignment was read, so
            # `pseudocounts` is set. The scaling factor is not used.
            _, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:
        # NOTE: this overrides both the protein name and any matrix derived
        # from the binary raw file above.
        protein = os.path.basename(mat_file).split(".")[0]

        mat, _ = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    # Upper-triangle index pairs starting `seqsep` diagonals above the main one.
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    plot_matrix = pd.DataFrame()
    # Residue numbers are reported 1-based.
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        # 1 for pairs closer than the threshold, 0 otherwise
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()

    plot_title = "Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)