Example #1
0
def compute_correction_terms(alignment_file, binary_raw_file):
    """Derive the APC and entropy correction matrices for a stored model.

    A CCMpred object is set up from the alignment and the binary raw file,
    its potentials are initialised and recentred, and both correction terms
    are computed from the resulting pair potentials.

    Args:
        alignment_file: path passed to ``CCMpred.set_alignment_file``.
        binary_raw_file: path passed to ``CCMpred.set_initraw_file``.

    Returns:
        Tuple ``(apc_mat, entropy_correction_mat)``.
    """
    model = CCMpred()

    # register input files
    model.set_alignment_file(alignment_file)
    model.set_initraw_file(binary_raw_file)

    # fixed preprocessing: psicov alignment with gap thresholds 50 / 75,
    # "simple" sequence weighting at cutoff 0.8, uniform pseudocounts of 1
    model.read_alignment("psicov", 50, 75)
    model.compute_sequence_weights("simple", 0.8)
    model.compute_frequencies("uniform_pseudocounts", 1, 1)

    # load the potentials and recentre them before scoring
    model.intialise_potentials()
    model.recenter_potentials()

    # APC term: product of column means divided by the overall mean
    frob_mat = contactmatrix.frobenius_score(model.x_pair)
    col_means = np.mean(frob_mat, axis=0)
    apc_mat = col_means[:, np.newaxis] * col_means[np.newaxis, :] / np.mean(
        frob_mat)

    # entropy term: difference between the raw and the corrected score;
    # the returned scaling factor is not needed here
    _, corrected = contactmatrix.compute_local_correction(
        model.pseudocounts.freqs[0],
        model.x_pair,
        model.neff,
        1,
        squared=False,
        entropy=True,
        nr_states=20,
        log=np.log2)

    return apc_mat, frob_mat - corrected
Example #2
0
def main():
    """Command-line driver for a CCMpred run.

    Parses the options and prepares the alignment statistics. If an
    alternative score (OMES or mutual information) is requested, that score
    is written and the process exits with status 0. Otherwise the model is
    regularized, initialised and optionally optimized, the requested outputs
    are written, and the process terminates through ``sys.exit``.

    The exit status is 0 unless optimization ran and reported a negative
    ``algret['code']``; in that case the negated code is used.
    """
    opt = parse_args()

    # optional banner
    if opt.logo:
        ccmpred.logo.logo()

    # the OpenMP thread count is handed over through the environment
    n_threads = str(opt.num_threads)
    os.environ['OMP_NUM_THREADS'] = n_threads
    print("Using {0} threads for OMP parallelization.".format(n_threads))

    # create the CCMpred object and register every file path
    ccm = CCMpred()
    ccm.set_alignment_file(opt.alnfile)
    ccm.set_matfile(opt.matfile)
    ccm.set_pdb_file(opt.pdbfile)
    ccm.set_initraw_file(opt.initrawfile)

    # alignment -> sequence weights -> (pseudo)count frequencies
    ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)
    ccm.compute_sequence_weights(opt.weight, opt.wt_cutoff)
    ccm.compute_frequencies(opt.pseudocounts, opt.pseudocount_single,
                            opt.pseudocount_pair)

    # a pdb file is only read when one was given
    if opt.pdbfile:
        ccm.read_pdb(opt.contact_threshold)

    # alternative scores short-circuit the run; OMES takes precedence
    if opt.omes:
        ccm.compute_omes(opt.omes_fodoraldrich)
    elif opt.mi:
        ccm.compute_mutual_info(opt.mi_normalized, opt.mi_pseudocounts)
    if opt.omes or opt.mi:
        ccm.write_matrix()
        sys.exit(0)

    # L2 regularization
    ccm.specify_regularization(opt.lambda_single,
                               opt.lambda_pair_factor,
                               reg_type="L2",
                               scaling="L",
                               single_prior=opt.single_prior)

    # potentials come from the regularization priors or from the initrawfile
    ccm.intialise_potentials()

    if not opt.optimize:
        print("\nDo not optimize but use model parameters provided by {0}\n".
              format(opt.initrawfile))
    else:
        # set up logging, then minimize the objective function
        ccm.initiate_logging(opt.plot_opt_progress)
        ccm.minimize(opt)

    # post processing: contact score, bias correction, matrix output
    if opt.matfile:
        ccm.compute_contact_matrix(recenter_potentials=True, frob=True)
        ccm.compute_correction(
            apc_file=opt.apc_file,
            entropy_correction_file=opt.entropy_correction_file)
        ccm.write_matrix()

    # model parameters in binary format, if requested
    if opt.out_binary_raw_file:
        ccm.write_binary_raw(opt.out_binary_raw_file)

    # a negative optimizer return code becomes a positive exit status
    status = 0
    if opt.optimize and ccm.algret['code'] < 0:
        status = -ccm.algret['code']
    sys.exit(status)
Example #3
0
def main():
    """Run CCMgen: sample a synthetic alignment from an MRF model and write it.

    Model potentials are loaded from the raw file named on the command line.
    Sequences are then drawn either by MCMC sampling or by evolving an
    ancestor sequence along a tree, and the sampled alignment is written to
    ``opt.outalnfile``.

    Raises:
        SystemExit: if MCMC sampling is off and neither a Neff-matching nor a
            fixed mutation rate was requested, so no alignment can be sampled.
    """
    # read command line options
    opt = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # set OMP environment variable for number of threads
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(
        os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_initraw_file(opt.rawfile)
    ccm.set_pdb_file(opt.pdbfile)

    # read alignment and remove gapped sequences and positions
    if opt.alnfile:
        ccm.set_alignment_file(opt.alnfile)
        ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)

    # read potentials from binary raw file (possibly remove positions with many gaps)
    ccm.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(ccm.x_single,
                                                        ccm.x_pair,
                                                        nogapstate=True,
                                                        padding=False)
    ncol = ccm.x_single.shape[0]

    # if MCMC sampling is specified (requires alignment file)
    if opt.mcmc:
        msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample(
            x,
            ccm.msa,
            size=opt.nseq,
            burn_in=opt.mcmc_burn_in,
            sample_type=opt.mcmc_sample_type)

        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]

    else:

        tree = ccmpred.trees.CCMTree()

        # prepare tree topology
        if opt.tree_file:
            tree.load_tree(opt.tree_file)
        elif opt.tree_source is not None:
            tree.specify_tree(opt.nseq, opt.tree_source)

        ids = tree.ids

        # sample alignment with Neff similar to alignment Neff (requires alignment file and burn-in)
        if opt.mutation_rate_neff:
            msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly(
                tree, ccm.neff_entropy, ncol, x, opt.seq0_mrf)
        # sample alignment with specified mutation rate
        elif opt.mutation_rate:
            if opt.seq0_mrf:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf)
                print(
                    "Ancestor sequence (polyA --> {0} gibbs steps --> seq0) : {1}"
                    .format(
                        opt.seq0_mrf, "".join([
                            ccmpred.io.alignment.AMINO_ACIDS[c]
                            for c in seq0[0]
                        ])))
            elif opt.seq0_file:
                seq0 = ccmpred.io.alignment.read_msa(opt.seq0_file,
                                                     opt.aln_format)
                print("Ancestor sequence: {0}".format("".join(
                    [ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))
            else:
                # default ancestor: a single all-zero index sequence
                seq0 = np.zeros((1, ncol), dtype="uint8")

            msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate(
                tree, seq0, x, opt.mutation_rate)
        else:
            # Without this guard `msa_sampled` would stay unbound and the
            # function would fail further down with an UnboundLocalError.
            raise SystemExit(
                "No sampling mode selected: enable MCMC sampling or specify "
                "a mutation rate (fixed or Neff-based).")

    # if gappy positions have been removed
    # insert columns with gaps at that position
    if ccm.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, ccm.gapped_positions)

    print("\nWriting sampled alignment to {0}".format(opt.outalnfile))
    with open(opt.outalnfile, "w") as f:
        descs = [
            "synthetic sequence generated with CCMgen"
            for _ in range(msa_sampled.shape[0])
        ]
        ccmpred.io.alignment.write_msa(f,
                                       msa_sampled,
                                       ids,
                                       is_indices=True,
                                       format=opt.aln_format,
                                       descriptions=descs)
Example #4
0
def main():
    """Run CCMgen: sample a synthetic alignment from an MRF model and write it.

    Model potentials are loaded from the raw file named on the command line.
    Sequences are drawn either by MCMC sampling or by evolving an ancestor
    sequence along a tree (with a fixed mutation rate, or towards a target
    Neff), and the sampled alignment is written to ``opt.outalnfile``.

    Raises:
        SystemExit: with status 1 if a user-supplied ancestor sequence does
            not have the same length as the model.
    """

    def read_root_sequence(seq0_file, aln_format, print_sequence=True):
        """Read the ancestor sequence from ``seq0_file``.

        Returns an array holding exactly one sequence. Relies on ``ncol``
        from the enclosing scope, so it must only be called after the
        potentials have been loaded.
        """
        seq0 = ccmpred.io.alignment.read_msa(seq0_file, aln_format)
        seq_N, seq_L = seq0.shape

        if seq_L != ncol:
            print(
                "Length of ancestor sequence must match dimension of MRF model!"
            )
            # This is an error: leave with a non-zero status so callers can
            # detect the failure (the interactive `exit` helper with status
            # 0 reported success).
            raise SystemExit(1)

        if seq_N > 1:
            print(
                "You passed a fasta file with more than one sequence as a root sequences! We took the first sequence."
            )
            print_sequence = True
            # Actually keep only the first sequence, as the message states;
            # slicing preserves the (1, L) shape of the default ancestor.
            seq0 = seq0[:1]

        if print_sequence:
            print("Ancestor sequence:\n{0}".format("".join(
                [ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))

        return seq0

    # read command line options
    opt = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # set OMP environment variable for number of threads
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(
        os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_initraw_file(opt.rawfile)

    # read alignment and remove gapped sequences and positions
    if opt.alnfile:
        ccm.set_alignment_file(opt.alnfile)
        ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)

    # read potentials from binary raw file (possibly remove positions with many gaps)
    ccm.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(ccm.x_single,
                                                        ccm.x_pair,
                                                        nogapstate=True,
                                                        padding=False)
    ncol = ccm.x_single.shape[0]

    # if MCMC sampling is specified
    if opt.mcmc:
        msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample(
            x,
            ncol,
            ccm.msa,
            size=opt.nseq,
            burn_in=opt.mcmc_burn_in,
            sample_type=opt.mcmc_sample_type)

        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]

    else:

        tree = ccmpred.trees.CCMTree()

        # prepare tree topology
        if opt.tree_file:

            tree.load_tree(opt.tree_file)
            nseq = tree.n_leaves

        else:

            # without a tree file the number of sequences comes from the
            # alignment if one was given, otherwise from the command line
            if opt.alnfile:
                nseq = ccm.N
            else:
                nseq = opt.nseq
            tree.specify_tree(nseq, opt.tree_source)

        ids = tree.ids

        # sample alignment with specified mutation rate
        if opt.mutation_rate:
            # default ancestor: a single all-zero index sequence
            seq0 = np.zeros((1, ncol), dtype="uint8")

            # a sequence file takes precedence over MRF-sampled ancestors
            if opt.seq0_mrf and not opt.seq0_file:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf)
                print(
                    "Ancestor sequence (polyA --> {0} gibbs steps --> seq0) :\n{1}"
                    .format(
                        opt.seq0_mrf, "".join([
                            ccmpred.io.alignment.AMINO_ACIDS[c]
                            for c in seq0[0]
                        ])))

            elif opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)

            msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate(
                tree, nseq, seq0, x, opt.mutation_rate)

        # sample an alignment that has approximately the specified Neff
        else:
            seq0 = None

            # target Neff: taken from the alignment when one was read
            if opt.alnfile:
                neff = ccm.neff_entropy
            else:
                neff = opt.neff

            if opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)

            msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly(
                tree, nseq, neff, ncol, x, opt.seq0_mrf, root_seq=seq0)

    # if gappy positions have been removed
    # insert columns with gaps at that position
    if ccm.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, ccm.gapped_positions)

    print("\nWriting sampled alignment to {0}".format(opt.outalnfile))
    with open(opt.outalnfile, "w") as f:
        descs = [
            "synthetic sequence generated with CCMgen"
            for _ in range(msa_sampled.shape[0])
        ]
        ccmpred.io.alignment.write_msa(f,
                                       msa_sampled,
                                       ids,
                                       is_indices=True,
                                       format=opt.aln_format,
                                       descriptions=descs)
def main():
    """Command-line driver for a CCMpred run (alignment and matrix file given
    to the ``CCMpred`` constructor).

    Parses the options and prepares the alignment statistics. If an
    alternative score (OMES or mutual information) is requested, that score
    is written and the process exits with status 0. Otherwise the model is
    regularized, initialised and optionally optimized, the contact matrix is
    computed, corrected and written, and the process terminates through
    ``sys.exit``.

    The exit status is 0 unless optimization ran and reported a negative
    ``algret['code']``; in that case the negated code is used.
    """
    opt = parse_args()

    # optional banner
    if opt.logo:
        ccmpred.logo.logo()

    # the OpenMP thread count is handed over through the environment
    n_threads = str(opt.num_threads)
    os.environ['OMP_NUM_THREADS'] = n_threads
    print("Using {0} threads for OMP parallelization.".format(n_threads))

    ccm = CCMpred(opt.alnfile, opt.matfile)

    # ---- setup: alignment -> sequence weights -> frequencies ----
    ccm.read_alignment(opt.aln_format, opt.max_gap_ratio)
    ccm.compute_sequence_weights(opt.weight, opt.wt_ignore_gaps, opt.wt_cutoff)
    ccm.compute_frequencies(opt.pseudocounts, opt.pseudocount_single,
                            opt.pseudocount_pair)

    # alternative scores short-circuit the run; OMES takes precedence
    if opt.omes:
        ccm.compute_omes(opt.omes_fodoraldrich)
    elif opt.mi:
        ccm.compute_mutual_info(opt.mi_normalized, opt.mi_pseudocounts)
    if opt.omes or opt.mi:
        ccm.write_matrix()
        sys.exit(0)

    # regularization type and scaling are taken from the options
    ccm.specify_regularization(opt.lambda_single,
                               opt.lambda_pair_factor,
                               reg_type=opt.reg_type,
                               scaling=opt.scaling)

    # potentials come from the regularization priors or from file
    ccm.intialise_potentials(opt.initrawfile)

    # ---- optimization ----
    if not opt.optimize:
        print(
            "\nDo not optimize but load couplings from binary raw file {0}\n".
            format(opt.initrawfile))
    else:
        # objective function first, then the optimizer, both looked up by name
        objective = OBJ_FUNC[opt.objfun](opt, ccm)
        optimizer = ALGORITHMS[opt.algorithm](opt, ccm)
        ccm.minimize(objective, optimizer)

    # ---- post processing ----
    # contact score, optionally on recentred potentials
    ccm.compute_contact_matrix(recenter_potentials=opt.centering_potentials,
                               frob=opt.frob)

    # bias correction of the contact score
    ccm.compute_correction(apc=opt.apc,
                           entropy_correction=opt.entropy_correction)

    # (corrected) contact matrices
    ccm.write_matrix()

    # the sampled alignment is only written if the objective produced one
    if opt.cd_alnfile and hasattr(ccm.f, 'msa_sampled'):
        ccm.write_sampled_alignment(opt.cd_alnfile)

    if opt.out_binary_raw_file:
        ccm.write_binary_raw(opt.out_binary_raw_file)

    # a negative optimizer return code becomes a positive exit status
    status = 0
    if opt.optimize and ccm.algret['code'] < 0:
        status = -ccm.algret['code']
    sys.exit(status)
Example #6
0
def main(alnfile, outfile, pair_mat):
    """Run a complete CCMpred optimization with fixed settings.

    Args:
        alnfile: path of the input alignment.
        outfile: output path; it is passed as ``apc_file`` to
            ``compute_correction`` and, with ``'.ccmraw'`` appended, used as
            the matrix file.
        pair_mat: forwarded as third positional argument to
            ``CCMpred.specify_regularization``.

    The function always terminates the process through ``sys.exit``. The
    status is 0 unless the optimizer reported a negative ``algret['code']``,
    in which case the negated code is used.
    """
    # print logo
    ccmpred.logo.logo()

    # set OMP environment variable for number of threads
    # NOTE(review): `opt` is not defined in this function; this presumably
    # relies on a module-level `opt` -- confirm, otherwise this line raises
    # NameError.
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(
        os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_alignment_file(alnfile)
    # fixed: the original referenced the undefined name `oufile`
    ccm.set_matfile(outfile + '.ccmraw')

    # read alignment and possibly remove gapped sequences and positions
    ccm.read_alignment()

    # compute sequence weights (in order to reduce sampling bias)
    ccm.compute_sequence_weights("simple", 0.8)

    # compute amino acid counts and frequencies adding pseudo counts for
    # non-observed amino acids
    ccm.compute_frequencies("uniform_pseudocounts")

    # setup regularization
    ccm.specify_regularization(10, 0.2, pair_mat)

    # initialise single and pair potentials
    ccm.intialise_potentials()

    # initialize log object, then minimize the objective function
    ccm.initiate_logging()
    ccm.minimize()

    ### Post Processing

    # Compute contact score (frobenius norm) by recentering potentials
    # TODO: other scores can be added ...
    ccm.compute_contact_matrix(recenter_potentials=True, frob=True)

    # compute corrected contact maps (removing entropy/phylogenetic biases)
    # TODO: other corrections can be added ...
    ccm.compute_correction(
        apc_file=outfile,
        entropy_correction_file=None
    )

    ccm.write_matrix()

    # a negative optimizer return code becomes a positive exit status
    exitcode = 0
    if ccm.algret['code'] < 0:
        exitcode = -ccm.algret['code']
    sys.exit(exitcode)