def compute_correction_terms(alignment_file, binary_raw_file):
    """Return the APC matrix and the entropy correction matrix of a model.

    A ``CCMpred`` object is configured with fixed settings (alignment read
    as "psicov" with thresholds 50 and 75, "simple" sequence weights with
    cutoff 0.8, "uniform_pseudocounts" with counts 1 and 1) and its
    potentials are initialised from ``binary_raw_file``.

    Args:
        alignment_file: path passed to ``CCMpred.set_alignment_file``.
        binary_raw_file: path passed to ``CCMpred.set_initraw_file``.

    Returns:
        Tuple ``(apc_mat, entropy_correction_mat)``; the second element is
        the Frobenius score matrix minus the locally corrected matrix.
    """
    # configure the model object and point it at its input files
    model = CCMpred()
    model.set_alignment_file(alignment_file)
    model.set_initraw_file(binary_raw_file)

    # alignment -> sequence weights -> frequencies with pseudo counts
    model.read_alignment("psicov", 50, 75)
    model.compute_sequence_weights("simple", 0.8)
    model.compute_frequencies("uniform_pseudocounts", 1, 1)

    # load the potentials from the binary raw file, then recenter them
    model.intialise_potentials()
    model.recenter_potentials()

    # average product correction: outer product of the axis-0 means,
    # divided by the overall mean of the score matrix
    frob_scores = contactmatrix.frobenius_score(model.x_pair)
    axis_means = np.mean(frob_scores, axis=0)
    outer_means = axis_means[:, np.newaxis] * axis_means[np.newaxis, :]
    apc_mat = outer_means / np.mean(frob_scores)

    # entropy correction; the returned scaling factor is not needed here
    _, locally_corrected = contactmatrix.compute_local_correction(
        model.pseudocounts.freqs[0],
        model.x_pair,
        model.neff,
        1,
        squared=False,
        entropy=True,
        nr_states=20,
        log=np.log2,
    )

    return apc_mat, frob_scores - locally_corrected
def main():
    """Run CCMpred as configured by the command line options.

    Steps, in order: parse options, set ``OMP_NUM_THREADS``, read the
    alignment, compute sequence weights and frequencies, optionally compute
    an alternative score (OMES or MI) and exit, otherwise set up the
    regularization, initialise and (optionally) optimize the potentials,
    and write the requested output files.

    The function always terminates through ``sys.exit``: status 0 normally,
    or the negated ``ccm.algret['code']`` when optimization ran and that
    code is negative.
    """
    args = parse_args()

    # the logo is optional for this tool
    if args.logo:
        ccmpred.logo.logo()

    # number of threads for OMP is passed via the environment
    os.environ['OMP_NUM_THREADS'] = str(args.num_threads)
    thread_msg = "Using {0} threads for OMP parallelization."
    print(thread_msg.format(os.environ["OMP_NUM_THREADS"]))

    model = CCMpred()

    # register all file paths that may be used during the run
    model.set_alignment_file(args.alnfile)
    model.set_matfile(args.matfile)
    model.set_pdb_file(args.pdbfile)
    model.set_initraw_file(args.initrawfile)

    # alignment (possibly dropping gapped sequences and positions),
    # sequence weights (to reduce sampling bias), and amino acid
    # counts/frequencies with pseudo counts
    model.read_alignment(args.aln_format, args.max_gap_pos, args.max_gap_seq)
    model.compute_sequence_weights(args.weight, args.wt_cutoff)
    model.compute_frequencies(
        args.pseudocounts, args.pseudocount_single, args.pseudocount_pair)

    # a pdb file is only read when one was given (constrained run)
    if args.pdbfile:
        model.read_pdb(args.contact_threshold)

    # alternative scores: compute, write the matrix, and stop right away
    if args.omes:
        model.compute_omes(args.omes_fodoraldrich)
        model.write_matrix()
        sys.exit(0)

    if args.mi:
        model.compute_mutual_info(args.mi_normalized, args.mi_pseudocounts)
        model.write_matrix()
        sys.exit(0)

    # L2 regularization
    model.specify_regularization(
        args.lambda_single,
        args.lambda_pair_factor,
        reg_type="L2",
        scaling="L",
        single_prior=args.single_prior,
    )

    # single and pair potentials come either from the regularization priors
    # or from the initrawfile
    model.intialise_potentials()

    if not args.optimize:
        print("\nDo not optimize but use model parameters provided by {0}\n".
              format(args.initrawfile))
    else:
        # set up logging first, then minimize the objective function
        model.initiate_logging(args.plot_opt_progress)
        model.minimize(args)

    # --- post processing ---

    if args.matfile:
        # contact score: frobenius norm on recentered potentials
        model.compute_contact_matrix(recenter_potentials=True, frob=True)

        # corrected contact maps
        model.compute_correction(
            apc_file=args.apc_file,
            entropy_correction_file=args.entropy_correction_file,
        )
        model.write_matrix()

    # model parameters in binary format, if requested
    if args.out_binary_raw_file:
        model.write_binary_raw(args.out_binary_raw_file)

    # a negative optimizer return code becomes a positive exit status
    optimizer_failed = args.optimize and model.algret['code'] < 0
    sys.exit(-model.algret['code'] if optimizer_failed else 0)
def main():
    """Sample a synthetic alignment with CCMgen and write it to a file.

    Potentials are read from ``opt.rawfile``. Sequences are then generated
    either by MCMC sampling (``opt.mcmc``) or along a tree, and the result
    is written to ``opt.outalnfile`` in ``opt.aln_format``.
    """
    args = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # number of threads for OMP is passed via the environment
    os.environ['OMP_NUM_THREADS'] = str(args.num_threads)
    thread_msg = "Using {0} threads for OMP parallelization."
    print(thread_msg.format(os.environ["OMP_NUM_THREADS"]))

    model = CCMpred()

    # register the file paths that may be used
    model.set_initraw_file(args.rawfile)
    model.set_pdb_file(args.pdbfile)

    # the alignment is optional; when given, gapped sequences and positions
    # are removed while reading
    if args.alnfile:
        model.set_alignment_file(args.alnfile)
        model.read_alignment(
            args.aln_format, args.max_gap_pos, args.max_gap_seq)

    # potentials from the binary raw file, flattened into one vector
    model.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(
        model.x_single, model.x_pair, nogapstate=True, padding=False)
    ncol = model.x_single.shape[0]

    def as_letters(indices):
        # map amino acid indices to their one-letter string
        return "".join([ccmpred.io.alignment.AMINO_ACIDS[c] for c in indices])

    if not args.mcmc:
        tree = ccmpred.trees.CCMTree()

        # tree topology: loaded from file, or specified from a source
        if args.tree_file:
            tree.load_tree(args.tree_file)
        elif args.tree_source is not None:
            tree.specify_tree(args.nseq, args.tree_source)

        ids = tree.ids

        # NOTE(review): if neither mutation_rate_neff nor mutation_rate is
        # set, msa_sampled is never assigned below - presumably parse_args
        # guarantees one of them; confirm.
        if args.mutation_rate_neff:
            # sample an alignment with Neff similar to the alignment Neff
            msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly(
                tree, model.neff_entropy, ncol, x, args.seq0_mrf)
        elif args.mutation_rate:
            # choose the ancestor sequence, then sample with a fixed rate
            if args.seq0_mrf:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, args.seq0_mrf)
                print(
                    "Ancestor sequence (polyA --> {0} gibbs steps --> seq0) : {1}"
                    .format(args.seq0_mrf, as_letters(seq0[0])))
            elif args.seq0_file:
                seq0 = ccmpred.io.alignment.read_msa(
                    args.seq0_file, args.aln_format)
                print("Ancestor sequence: {0}".format(as_letters(seq0[0])))
            else:
                seq0 = np.zeros((1, ncol), dtype="uint8")

            msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate(
                tree, seq0, x, args.mutation_rate)
    else:
        # MCMC sampling (uses the alignment stored on the model)
        msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample(
            x,
            model.msa,
            size=args.nseq,
            burn_in=args.mcmc_burn_in,
            sample_type=args.mcmc_sample_type,
        )
        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]

    # if gappy positions have been removed, put gap columns back in place
    if model.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, model.gapped_positions)

    print("\nWriting sampled alignment to {0}".format(args.outalnfile))
    with open(args.outalnfile, "w") as f:
        # one identical description per sampled sequence
        descs = [
            "synthetic sequence generated with CCMgen"
            for _ in range(msa_sampled.shape[0])
        ]
        ccmpred.io.alignment.write_msa(
            f,
            msa_sampled,
            ids,
            is_indices=True,
            format=args.aln_format,
            descriptions=descs,
        )
def main():
    """Sample a synthetic alignment with CCMgen and write it to a file.

    Potentials are read from ``opt.rawfile``. Sequences are generated either
    by MCMC sampling (``opt.mcmc``) or along a tree, using a fixed mutation
    rate (``opt.mutation_rate``) or else targeting a Neff value. The result
    is written to ``opt.outalnfile`` in ``opt.aln_format``.

    Exits with status 1 if a root sequence file is given whose sequence
    length does not match the number of columns of the model.
    """
    # Local import so the block does not rely on a module-level import.
    import sys

    def read_root_sequence(seq0_file, aln_format, print_sequence=True):
        """Read and validate the ancestor sequence from ``seq0_file``.

        Relies on ``ncol`` from the enclosing scope, which is assigned
        before this helper is first called.

        Returns:
            An array holding exactly one sequence (the first one in the
            file).
        """
        seq0 = ccmpred.io.alignment.read_msa(seq0_file, aln_format)
        seq_N, seq_L = seq0.shape

        if seq_L != ncol:
            print(
                "Length of ancestor sequence must match dimension of MRF model!"
            )
            # This is an error: use sys.exit (the site builtin ``exit`` is
            # meant for interactive use) and report a non-zero status.
            sys.exit(1)

        if seq_N > 1:
            print(
                "You passed a fasta file with more than one sequence as a root sequences! We took the first sequence."
            )
            # Actually keep only the first sequence, as the message states.
            seq0 = seq0[:1]
            print_sequence = True

        if print_sequence:
            print("Ancestor sequence:\n{0}".format("".join(
                [ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))

        return seq0

    # read command line options
    opt = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # set OMP environment variable for number of threads
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(
        os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_initraw_file(opt.rawfile)

    # read alignment and remove gapped sequences and positions
    if opt.alnfile:
        ccm.set_alignment_file(opt.alnfile)
        ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)

    # read potentials from binary raw file (possibly remove positions with
    # many gaps) and flatten them into one parameter vector
    ccm.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(
        ccm.x_single, ccm.x_pair, nogapstate=True, padding=False)
    ncol = ccm.x_single.shape[0]

    if opt.mcmc:
        # MCMC sampling
        msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample(
            x,
            ncol,
            ccm.msa,
            size=opt.nseq,
            burn_in=opt.mcmc_burn_in,
            sample_type=opt.mcmc_sample_type)

        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]
    else:
        tree = ccmpred.trees.CCMTree()

        # prepare tree topology; the number of sequences follows the tree
        # file, else the alignment, else the command line option
        if opt.tree_file:
            tree.load_tree(opt.tree_file)
            nseq = tree.n_leaves
        else:
            if opt.alnfile:
                nseq = ccm.N
            else:
                nseq = opt.nseq
            tree.specify_tree(nseq, opt.tree_source)

        ids = tree.ids

        if opt.mutation_rate:
            # sample alignment with specified mutation rate; a root sequence
            # file takes precedence over Gibbs sampling the root, and the
            # fallback is an all-zero index sequence
            if opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)
            elif opt.seq0_mrf:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf)
                print(
                    "Ancestor sequence (polyA --> {0} gibbs steps --> seq0) :\n{1}"
                    .format(
                        opt.seq0_mrf, "".join([
                            ccmpred.io.alignment.AMINO_ACIDS[c]
                            for c in seq0[0]
                        ])))
            else:
                seq0 = np.zeros((1, ncol), dtype="uint8")

            msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate(
                tree, nseq, seq0, x, opt.mutation_rate)
        else:
            # sample an alignment that has approximately the specified Neff
            seq0 = None

            if opt.alnfile:
                neff = ccm.neff_entropy
            else:
                neff = opt.neff

            if opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)

            msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly(
                tree, nseq, neff, ncol, x, opt.seq0_mrf, root_seq=seq0)

    # if gappy positions have been removed
    # insert columns with gaps at that position
    if ccm.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, ccm.gapped_positions)

    print("\nWriting sampled alignment to {0}".format(opt.outalnfile))
    with open(opt.outalnfile, "w") as f:
        descs = [
            "synthetic sequence generated with CCMgen"
            for _ in range(msa_sampled.shape[0])
        ]
        ccmpred.io.alignment.write_msa(
            f,
            msa_sampled,
            ids,
            is_indices=True,
            format=opt.aln_format,
            descriptions=descs)