def parse_arguments():
    """Parse the command line options for the SELEX simulation script.

    The model to simulate from is taken from ``--pwm`` when it is given;
    otherwise ``--energy-model`` must be supplied.

    Side effects:
        Sets ``pyTFbindtools.VERBOSE`` from ``--verbose`` and, when
        ``--random-seed`` is given, seeds numpy's global random number
        generator.

    Returns:
        A tuple ``(motif, prot_conc, dna_conc, sim_sizes, pool_size)``.
        ``sim_sizes`` is the list of per-round read counts (``None`` when
        ``--sim-sizes`` is omitted) and ``pool_size`` is
        ``--random-seq-pool-size`` converted to an int.

    Raises:
        AssertionError: if the PWM file does not hold exactly one motif, or
            if neither ``--pwm`` nor ``--energy-model`` was given.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Simulate a SELEX experiment.')
    # FileType('r') validates that the path is readable at parse time and
    # reports failures as a normal argparse usage error.
    parser.add_argument(
        '--energy-model', type=argparse.FileType('r'),
        help='An energy model to simulate from.')
    parser.add_argument(
        '--pwm', type=argparse.FileType('r'),
        help='A pwm to simulate from.')
    parser.add_argument(
        '--sim-sizes', type=int, nargs='+',
        help='Number of reads to simulate for each round.')
    parser.add_argument(
        '--prot-conc', type=float, default=7.75e-10,
        help='The protein concentration.')
    parser.add_argument(
        '--dna-conc', type=float, default=2e-8,
        help='The DNA concentration.')
    parser.add_argument(
        '--random-seed', type=int,
        help='Set the random number generator seed.')
    # Parsed as a float so that values such as 1e5 are accepted.
    parser.add_argument(
        '--random-seq-pool-size', type=float, default=1e5,
        help='The random pool size for the bootstrap.')
    parser.add_argument(
        '--verbose', default=False, action='store_true',
        help='Print extra status information.')
    args = parser.parse_args()

    pyTFbindtools.VERBOSE = args.verbose
    if args.random_seed is not None:
        np.random.seed(args.random_seed)

    try:
        if args.pwm is not None:
            pyTFbindtools.log("Loading PWM starting location", 'VERBOSE')
            motifs = load_motifs(args.pwm.name)
            assert len(motifs) == 1, "Motif file contains multiple motifs"
            # list() keeps the positional lookup valid when values() is a
            # view rather than a list.
            motif = list(motifs.values())[0][0]
        else:
            assert args.energy_model is not None, \
                "Either --energy-model or --pwm must be specified"
            pyTFbindtools.log("Loading energy data", 'VERBOSE')
            motif = load_energy_data(args.energy_model.name)
    finally:
        # Only the file names are used by the loaders, so release every
        # handle argparse opened (including an unused --energy-model when
        # --pwm was also supplied).
        for handle in (args.pwm, args.energy_model):
            if handle is not None:
                handle.close()

    return (
        motif,
        args.prot_conc,
        args.dna_conc,
        args.sim_sizes,
        int(args.random_seq_pool_size),
    )
def find_best_shift(rnds_and_seqs, ddg_array, ref_energy):
    """Choose the side on which the motif should be extended by one base.

    A PWM is built from the energies and refit against the last entry of
    ``rnds_and_seqs``. Its transpose is then padded with a zero column on
    the left and, separately, on the right, and each padded matrix is refit.
    The entropy of the first row of the left-padded fit is compared with the
    entropy of the last row of the right-padded fit.

    Returns:
        ``"LEFT"`` when the left entropy is strictly smaller, ``"RIGHT"``
        otherwise.
    """
    def _entropy(row):
        # Shannon-style entropy of a single PWM row.
        return -(row*np.log(row)).sum()

    final_round = rnds_and_seqs[-1]
    empty_column = np.zeros((4, 1))

    starting_pwm = build_pwm_from_energies(ddg_array, ref_energy, -12)
    pwm = find_pwm_from_starting_alignment(final_round, starting_pwm)

    left_padded = np.hstack((empty_column, pwm.T))
    left_shift_pwm = find_pwm_from_starting_alignment(
        final_round, left_padded)

    right_padded = np.hstack((pwm.T, empty_column))
    right_shift_pwm = find_pwm_from_starting_alignment(
        final_round, right_padded)

    # Shift in the direction whose newly added position has the smallest
    # entropy.
    left_shift_score = _entropy(left_shift_pwm[0, :])
    right_shift_score = _entropy(right_shift_pwm[-1, :])

    pyTFbindtools.log(
        "Left shift entropy: %.2e" % left_shift_score, 'VERBOSE')
    pyTFbindtools.log(
        "Right shift entropy: %.2e" % right_shift_score, 'VERBOSE')

    return "LEFT" if left_shift_score < right_shift_score else "RIGHT"
def simulate_reads(
        motif, seq_len, sim_sizes, dna_conc, prot_conc,
        fname_prefix="test", pool_size=100000):
    """Simulate SELEX rounds and write the reads of each round to a file.

    A random pool of ``pool_size`` sequences of length ``seq_len`` is drawn
    with bases coded 0-3. For every round, the occupancy of each pool member
    is estimated with ``motif.est_occ``; ``sim_size`` reads are sampled with
    replacement in proportion to occupancy and written, one ``ACGT`` string
    per line, to ``"<fname_prefix>_rnd_<round>.txt"`` (rounds start at 1).
    The pool for the next round is formed by resampling ``pool_size``
    sequences from those reads, again weighted by their occupancies.

    Args:
        motif: model providing ``build_ddg_array()``, ``est_occ()`` and
            ``len()``.
        seq_len: length of each simulated sequence.
        sim_sizes: number of reads to simulate for each round; its length
            sets the number of rounds.
        dna_conc: DNA concentration passed to ``est_chem_potentials``.
        prot_conc: protein concentration passed to ``est_chem_potentials``.
        fname_prefix: prefix of the per-round output files.
        pool_size: number of sequences kept in the pool between rounds.

    Returns:
        None. Results are written to disk; sampling uses numpy's global
        random state.
    """
    ref_energy, ddg_array = motif.build_ddg_array()
    chem_pots = est_chem_potentials(
        ddg_array, ref_energy,
        dna_conc, prot_conc,
        2*(seq_len-len(motif)+1),
        len(sim_sizes))

    # One randint call per sequence, kept as in the original so that seeded
    # runs consume the random stream in the same order.
    current_pool = np.array(
        [np.random.randint(4, size=seq_len) for _ in xrange(pool_size)])

    rounds = enumerate(zip(sim_sizes, chem_pots), start=1)
    for rnd, (sim_size, chem_pot) in rounds:
        occs = np.array(
            [motif.est_occ(chem_pot, seq) for seq in current_pool])

        # np.random.choice already returns an integer index array, so it can
        # index both the pool and the occupancies directly.
        seq_indices = np.random.choice(
            len(current_pool), size=sim_size,
            p=occs/occs.sum(), replace=True)
        seqs = current_pool[seq_indices]
        seq_occs = occs[seq_indices]

        with open("%s_rnd_%i.txt" % (fname_prefix, rnd), "w") as ofp:
            ofp.writelines(
                "".join('ACGT'[x] for x in seq) + "\n" for seq in seqs)

        # Refill the pool from the selected reads for the next round.
        current_pool = seqs[np.random.choice(
            len(seqs), size=pool_size,
            p=seq_occs/seq_occs.sum(), replace=True)]

        pyTFbindtools.log(
            "Finished simulations for round %i" % rnd, level='VERBOSE')

    pyTFbindtools.log("Finished Simulations", level='VERBOSE')
    pyTFbindtools.log("Ref Energy: %.2f" % ref_energy, level='VERBOSE')
    pyTFbindtools.log("Chem Pots: %s" % chem_pots, level='VERBOSE')
    pyTFbindtools.log(str(ddg_array), level='VERBOSE')
    return
def fit_model(rnds_and_seqs, ddg_array, ref_energy):
    """Iteratively fit an energy model, growing the motif one base at a time.

    Each iteration codes the sequences for the current motif length, refits
    the model with ``estimate_dg_matrix_with_adadelta`` and records
    ``[bs_len, lhd_hat, ddg_array, ref_energy]``. Fitting stops when the last
    value of the returned likelihood path exceeds the first by less than 10.
    Otherwise the motif is extended with three zero entries on the side
    chosen by ``find_best_shift`` and the loop continues, for at most
    ``min(20, len(rnds_and_seqs[0][0]) - ddg_array.motif_len + 1)``
    iterations. The recorded entries are printed before returning.

    NOTE(review): ``dna_conc`` and ``prot_conc`` are neither parameters nor
    assigned in this function; they are resolved from the enclosing scope
    and must be defined there before this is called.

    NOTE(review): when the iteration limit is reached without meeting the
    stop criterion, the returned array includes the zero entries added after
    the final fit.

    Args:
        rnds_and_seqs: sequences as consumed by ``PartitionedAndCodedSeqs``
            and ``find_best_shift``.
        ddg_array: starting energy array; must expose ``motif_len``.
        ref_energy: starting reference energy.

    Returns:
        ``(ddg_array, ref_energy)`` after the last iteration.

    Raises:
        AssertionError: if ``find_best_shift`` returns anything other than
            ``'LEFT'`` or ``'RIGHT'``.
    """
    opt_path = []
    max_extensions = min(
        20, len(rnds_and_seqs[0][0]) - ddg_array.motif_len + 1)
    for _ in xrange(max_extensions):
        bs_len = ddg_array.motif_len

        pyTFbindtools.log("Coding sequences", 'VERBOSE')
        partitioned_and_coded_rnds_and_seqs = PartitionedAndCodedSeqs(
            rnds_and_seqs, bs_len)

        pyTFbindtools.log("Estimating energy model", 'VERBOSE')
        (ddg_array, ref_energy, chem_pots, lhd_path, lhd_hat
         ) = estimate_dg_matrix_with_adadelta(
             partitioned_and_coded_rnds_and_seqs,
             ddg_array, ref_energy,
             dna_conc, prot_conc)
        opt_path.append([bs_len, lhd_hat, ddg_array, ref_energy])

        pyTFbindtools.log(ddg_array.consensus_seq(), 'VERBOSE')
        pyTFbindtools.log("Ref: %s" % ref_energy, 'VERBOSE')
        pyTFbindtools.log(
            "Mean: %s" % (ref_energy + ddg_array.sum()/3), 'VERBOSE')
        pyTFbindtools.log(
            "Min: %s" % ddg_array.calc_min_energy(ref_energy), 'VERBOSE')
        pyTFbindtools.log(
            str(ddg_array.calc_base_contributions().round(2)), 'VERBOSE')

        pyTFbindtools.log("Prev: %.2f\tCurr: %.2f\tDiff: %.2f" % (
            lhd_path[0], lhd_path[-1], lhd_path[0]-lhd_path[-1]), 'VERBOSE')
        # Stop once this fit improved the likelihood by less than 10.
        if lhd_path[0] + 10 > lhd_path[-1]:
            pyTFbindtools.log("Model has finished fitting", 'VERBOSE')
            break

        pyTFbindtools.log("Finding best shift", 'VERBOSE')
        shift_type = find_best_shift(rnds_and_seqs, ddg_array, ref_energy)
        new_base = np.zeros(3, dtype='float32')
        if shift_type == 'LEFT':
            pyTFbindtools.log("Adding left base to motif", level='VERBOSE')
            ddg_array = np.insert(
                ddg_array, 0, new_base).view(DeltaDeltaGArray)
        elif shift_type == 'RIGHT':
            pyTFbindtools.log("Adding right base to motif", level='VERBOSE')
            ddg_array = np.append(
                ddg_array, new_base).view(DeltaDeltaGArray)
        else:
            # Raised explicitly so the check survives ``python -O``.
            raise AssertionError(
                "Unrecognized shift type '%s'" % shift_type)

    for entry in opt_path:
        print(entry)

    return ddg_array, ref_energy
def parse_arguments():
    """Parse the command line options for the SELEX model estimation script.

    The starting motif comes from ``--starting-pwm`` if given, else from
    ``--starting-energy-model`` if given, else from a k-mer search over the
    loaded sequences with ``k = --initial-binding-site-len``.

    Side effects:
        Sets ``pyTFbindtools.VERBOSE`` and ``pyTFbindtools.DEBUG``, sets
        ``CONVERGENCE_MAX_LHD_CHANGE``, ``MAX_NUM_ITER`` and ``MOMENTUM`` on
        ``pyTFbindtools.selex``, and seeds numpy's global random number
        generator when ``--random-seed`` is given.

    Returns:
        A tuple ``(motif, rnds_and_seqs, pool_size)`` where ``pool_size`` is
        ``--random-seq-pool-size`` converted to an int.

    Raises:
        AssertionError: if both starting-model options are given, if
            ``--momentum`` is outside ``[0, 1)``, or if the PWM file does not
            hold exactly one motif.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Estimate energy models from a SELEX experiment.')
    # FileType('r') validates that each path is readable at parse time and
    # reports failures as a normal argparse usage error.
    parser.add_argument(
        '--selex-files', nargs='+', type=argparse.FileType('r'),
        required=True,
        help='Files containing SELEX reads.')
    # Parsed for the command line interface; not read in this function.
    parser.add_argument(
        '--background-sequence', type=argparse.FileType('r'),
        help='File containing reads sequenced from round 0.')
    parser.add_argument(
        '--starting-pwm', type=argparse.FileType('r'),
        help='A PWM to start from.')
    parser.add_argument(
        '--starting-energy-model', type=argparse.FileType('r'),
        help='An energy model to start from.')
    parser.add_argument(
        '--initial-binding-site-len', type=int, default=6,
        help='The starting length of the binding site (this will grow)')
    parser.add_argument(
        '--lhd-convergence-eps', type=float, default=1e-8,
        help='Convergence tolerance for lhd change.')
    # Parsed as floats so that values such as 1e5 are accepted.
    parser.add_argument(
        '--max-iter', type=float, default=1e5,
        help='Maximum number of optimization iterations.')
    parser.add_argument(
        '--momentum', type=float, default=0.1,
        help='Optimization tuning param (between 0 and 1).')
    parser.add_argument(
        '--random-seed', type=int,
        help='Set the random number generator seed.')
    parser.add_argument(
        '--random-seq-pool-size', type=float, default=1e6,
        help='The random pool size for the bootstrap.')
    parser.add_argument(
        '--verbose', default=False, action='store_true',
        help='Print extra status information.')
    parser.add_argument(
        '--debug-verbose', default=False, action='store_true',
        help='Print debug information.')
    args = parser.parse_args()

    # Every handle argparse opened; all are closed before returning.
    opened_files = list(args.selex_files) + [
        args.background_sequence,
        args.starting_pwm,
        args.starting_energy_model,
    ]
    try:
        assert not (args.starting_pwm and args.starting_energy_model), \
            "Can not set both --starting-pwm and --starting-energy-model"

        pyTFbindtools.VERBOSE = args.verbose or args.debug_verbose
        pyTFbindtools.DEBUG = args.debug_verbose

        pyTFbindtools.selex.CONVERGENCE_MAX_LHD_CHANGE = \
            args.lhd_convergence_eps
        pyTFbindtools.selex.MAX_NUM_ITER = int(args.max_iter)
        assert 0 <= args.momentum < 1
        pyTFbindtools.selex.MOMENTUM = args.momentum

        if args.random_seed is not None:
            np.random.seed(args.random_seed)

        pyTFbindtools.log("Loading sequences", 'VERBOSE')
        # Only the names are handed over; the handles themselves are unused.
        rnds_and_seqs = load_sequences(x.name for x in args.selex_files)

        if args.starting_pwm is not None:
            pyTFbindtools.log("Loading PWM starting location", 'VERBOSE')
            # NOTE(review): load_motifs receives the open file object here
            # and the first value of its result is used directly; confirm
            # both match load_motifs' contract.
            motifs = load_motifs(args.starting_pwm)
            assert len(motifs) == 1, "Motif file contains multiple motifs"
            motif = list(motifs.values())[0]
        elif args.starting_energy_model is not None:
            pyTFbindtools.log("Loading energy data", 'VERBOSE')
            motif = load_energy_data(args.starting_energy_model.name)
        else:
            pyTFbindtools.log(
                "Initializing starting location from %imer search"
                % args.initial_binding_site_len,
                'VERBOSE')
            factor_name = 'TEST'
            pwm = find_pwm(rnds_and_seqs, args.initial_binding_site_len)
            motif = Motif(
                "aligned_%imer" % args.initial_binding_site_len,
                factor_name, pwm)
    finally:
        for handle in opened_files:
            if handle is not None:
                handle.close()

    return motif, rnds_and_seqs, int(args.random_seq_pool_size)