Ejemplo n.º 1
0
def main():
    """Run the CCMpred command-line workflow from start to finish.

    Parses the command-line options, configures a ``CCMpred`` instance,
    reads the alignment, computes sequence weights and frequencies, and then
    either

    * computes an alternative score (OMES or mutual information), writes the
      matrix and exits with status 0, or
    * sets up L2 regularization, initialises the potentials, optionally
      optimizes them, and writes the contact matrix and/or the binary raw
      file when the corresponding options are set.

    The function never returns normally: it always ends in ``sys.exit``.
    The exit status is 0, except when optimization was run and
    ``algret['code']`` is negative; then the status is the negated code.
    """
    args = parse_args()

    # Logo output is optional.
    if args.logo:
        ccmpred.logo.logo()

    # Export the requested thread count for OMP and report the value that
    # ended up in the environment.
    os.environ['OMP_NUM_THREADS'] = str(args.num_threads)
    omp_threads = os.environ["OMP_NUM_THREADS"]
    print("Using {0} threads for OMP parallelization.".format(omp_threads))

    predictor = CCMpred()

    # Register all file paths the run may need.
    predictor.set_alignment_file(args.alnfile)
    predictor.set_matfile(args.matfile)
    predictor.set_pdb_file(args.pdbfile)
    predictor.set_initraw_file(args.initrawfile)

    # Read the alignment, possibly removing gapped sequences and positions.
    predictor.read_alignment(
        args.aln_format, args.max_gap_pos, args.max_gap_seq)

    # Sequence weights (to reduce sampling bias).
    predictor.compute_sequence_weights(args.weight, args.wt_cutoff)

    # Amino acid counts and frequencies, with pseudo counts for
    # non-observed amino acids.
    predictor.compute_frequencies(
        args.pseudocounts, args.pseudocount_single, args.pseudocount_pair)

    # A pdb file is only read when one was given (constrained run).
    if args.pdbfile:
        predictor.read_pdb(args.contact_threshold)

    # Alternative scores short-circuit the run: compute, write, exit.
    # OMES takes precedence over mutual information when both are set.
    if args.omes or args.mi:
        if args.omes:
            predictor.compute_omes(args.omes_fodoraldrich)
        else:
            predictor.compute_mutual_info(
                args.mi_normalized, args.mi_pseudocounts)
        predictor.write_matrix()
        sys.exit(0)

    # L2 regularization setup.
    predictor.specify_regularization(
        args.lambda_single,
        args.lambda_pair_factor,
        reg_type="L2",
        scaling="L",
        single_prior=args.single_prior)

    # Initialise single and pair potentials either
    #   - according to regularization priors, or
    #   - from initrawfile (accounting for removal of many gapped
    #     positions, if applicable).
    predictor.intialise_potentials()

    if not args.optimize:
        print("\nDo not optimize but use model parameters provided by {0}\n".
              format(args.initrawfile))
    else:
        # Optimize the objective function (pLL or CD/PCD) with the selected
        # algorithm (LBFGS, CG, GD or ADAM); logging is set up first.
        predictor.initiate_logging(args.plot_opt_progress)
        predictor.minimize(args)

    # --- Post processing ---

    if args.matfile:
        # Contact score (frobenius norm) computed with recentered potentials.
        # TODO: other scores can be added ...
        predictor.compute_contact_matrix(recenter_potentials=True, frob=True)

        # Corrected contact maps (removing entropy/phylogenetic biases).
        # TODO: other corrections can be added ...
        predictor.compute_correction(
            apc_file=args.apc_file,
            entropy_correction_file=args.entropy_correction_file)

        predictor.write_matrix()

    # Model parameters in binary format, if an output path was given.
    if args.out_binary_raw_file:
        predictor.write_binary_raw(args.out_binary_raw_file)

    # A negative optimizer return code becomes a positive exit status.
    status = 0
    if args.optimize and predictor.algret['code'] < 0:
        status = -predictor.algret['code']
    sys.exit(status)
Ejemplo n.º 2
0
def main():
    """Sample a synthetic alignment from model parameters and write it out.

    Parses the command-line options, loads the potentials from the raw file
    (and, if given, the alignment), then generates sequences in one of three
    ways:

    * MCMC sampling when ``opt.mcmc`` is set,
    * sampling along a tree until the alignment Neff is matched when
      ``opt.mutation_rate_neff`` is set,
    * sampling along a tree with the fixed ``opt.mutation_rate``.

    The sampled alignment is finally written to ``opt.outalnfile``.

    Raises:
        SystemExit: if MCMC sampling is off and neither
            ``opt.mutation_rate_neff`` nor ``opt.mutation_rate`` is set.
            Previously this case fell through and failed later with an
            ``UnboundLocalError`` on ``msa_sampled``, possibly after the
            output file had already been opened for writing.
    """

    # read command line options
    opt = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # set OMP environment variable for number of threads
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(
        os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_initraw_file(opt.rawfile)
    ccm.set_pdb_file(opt.pdbfile)

    # read alignment and remove gapped sequences and positions
    if opt.alnfile:
        ccm.set_alignment_file(opt.alnfile)
        ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)

    # read potentials from binary raw file (possibly remove positions with
    # many gaps) and flatten them into a single parameter vector
    ccm.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(ccm.x_single,
                                                        ccm.x_pair,
                                                        nogapstate=True,
                                                        padding=False)
    ncol = ccm.x_single.shape[0]

    # if MCMC sampling is specified (requires alignment file)
    if opt.mcmc:
        # the second return value (Neff) is not used here
        msa_sampled, _ = ccmpred.sampling.generate_mcmc_sample(
            x,
            ccm.msa,
            size=opt.nseq,
            burn_in=opt.mcmc_burn_in,
            sample_type=opt.mcmc_sample_type)

        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]

    else:

        tree = ccmpred.trees.CCMTree()

        # prepare tree topology
        if opt.tree_file:
            tree.load_tree(opt.tree_file)
        elif opt.tree_source is not None:
            tree.specify_tree(opt.nseq, opt.tree_source)

        ids = tree.ids

        # sample alignment with Neff similar to alignment Neff
        # (requires alignment file and burn-in)
        if opt.mutation_rate_neff:
            msa_sampled, _ = ccmpred.sampling.sample_to_neff_increasingly(
                tree, ccm.neff_entropy, ncol, x, opt.seq0_mrf)
        # sample alignment with specified mutation rate
        elif opt.mutation_rate:
            if opt.seq0_mrf:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf)
                print(
                    "Ancestor sequence (polyA --> {0} gibbs steps --> seq0) : {1}"
                    .format(
                        opt.seq0_mrf, "".join([
                            ccmpred.io.alignment.AMINO_ACIDS[c]
                            for c in seq0[0]
                        ])))
            elif opt.seq0_file:
                seq0 = ccmpred.io.alignment.read_msa(opt.seq0_file,
                                                     opt.aln_format)
                print("Ancestor sequence: {0}".format("".join(
                    [ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))
            else:
                # fall back to an all-zero index sequence of length ncol
                # (presumably poly-A -- confirm against AMINO_ACIDS)
                seq0 = np.zeros((1, ncol), dtype="uint8")

            msa_sampled, _ = ccmpred.sampling.sample_with_mutation_rate(
                tree, seq0, x, opt.mutation_rate)
        else:
            # Without this guard msa_sampled would stay unbound and the code
            # below would crash with an UnboundLocalError, potentially after
            # the output file has already been created/truncated.
            raise SystemExit(
                "CCMgen: no sampling mode selected - enable MCMC sampling or "
                "provide a mutation rate (fixed or Neff-matched).")

    # if gappy positions have been removed
    # insert columns with gaps at that position
    if ccm.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, ccm.gapped_positions)

    # one identical description per sampled sequence; built before the
    # output file is opened so the file is only touched once all data exists
    descs = [
        "synthetic sequence generated with CCMgen"
        for _ in range(msa_sampled.shape[0])
    ]

    print("\nWriting sampled alignment to {0}".format(opt.outalnfile))
    with open(opt.outalnfile, "w") as f:
        ccmpred.io.alignment.write_msa(f,
                                       msa_sampled,
                                       ids,
                                       is_indices=True,
                                       format=opt.aln_format,
                                       descriptions=descs)