def parse_arguments():
    """Parse the command line of the SELEX simulation script.

    Besides parsing, this sets ``pyTFbindtools.VERBOSE`` from ``--verbose``
    and, when ``--random-seed`` is supplied, seeds numpy's global random
    number generator.

    Returns:
        A tuple ``(motif, prot_conc, dna_conc, sim_sizes, pool_size)``.
        ``motif`` is loaded from ``--pwm`` when given, otherwise from
        ``--energy-model``; ``pool_size`` is ``--random-seq-pool-size``
        converted to ``int``.

    Raises:
        AssertionError: if ``load_motifs`` does not return exactly one
            entry, or if neither ``--pwm`` nor ``--energy-model`` is given.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Simulate a SELEX experiment.')

    parser.add_argument( '--energy-model', type=file,
                         help='An energy model to simulate from.')
    parser.add_argument( '--pwm', type=file,
                         help='A pwm to simulate from.')

    parser.add_argument( '--sim-sizes', type=int, nargs='+',
                         help='Number of reads to simulate for each round.')

    parser.add_argument( '--prot-conc', type=float, default=7.75e-10,
                         help='The protein concentration.')
    parser.add_argument( '--dna-conc', type=float, default=2e-8,
                         help='The DNA concentration.')

    parser.add_argument( '--random-seed', type=int,
                         help='Set the random number generator seed.')
    # parsed as a float (default 1e5) and cast to int in the return value
    parser.add_argument( '--random-seq-pool-size', type=float, default=1e5,
        help='The random pool size for the bootstrap.')

    parser.add_argument( '--verbose', default=False, action='store_true',
                         help='Print extra status information.')

    args = parser.parse_args()

    pyTFbindtools.VERBOSE = args.verbose

    if args.random_seed is not None:
        np.random.seed(args.random_seed)

    if args.pwm is not None:
        # --pwm takes precedence over --energy-model when both are given
        pyTFbindtools.log("Loading PWM starting location", 'VERBOSE')
        motifs = load_motifs(args.pwm.name)
        assert len(motifs) == 1, "Motif file contains multiple motifs"
        motif = motifs.values()[0][0]
        args.pwm.close()
    else:
        assert args.energy_model is not None, \
            "Either --energy-model or --pwm must be specified"
        pyTFbindtools.log("Loading energy data", 'VERBOSE')
        motif = load_energy_data(args.energy_model.name)
        # This parser defines --energy-model; there is no
        # 'starting_energy_model' attribute on args in this script.
        args.energy_model.close()

    return ( motif, args.prot_conc, args.dna_conc,
             args.sim_sizes,
             int(args.random_seq_pool_size) )
Beispiel #2
0
def find_best_shift(rnds_and_seqs, ddg_array, ref_energy):
    """Choose the side on which the motif should be extended by one base.

    A PWM is aligned against the last entry of ``rnds_and_seqs``, starting
    from the PWM built from ``ddg_array`` and ``ref_energy``. That PWM is
    then padded with a zero column on the left and, separately, on the
    right, and each padded matrix is re-aligned against the same sequences.
    The entropy of the first row of the left-extended result is compared
    with the entropy of the last row of the right-extended result.

    Returns:
        "LEFT" if the left entropy is strictly smaller, otherwise "RIGHT".
    """
    final_rnd_seqs = rnds_and_seqs[-1]

    def _entropy(probs):
        # Shannon entropy (natural log) of one PWM row
        return -(probs*np.log(probs)).sum()

    pwm = find_pwm_from_starting_alignment(
        final_rnd_seqs, build_pwm_from_energies(ddg_array, ref_energy, -12))

    # pad with an all-zero column before / after the current motif
    left_start = np.hstack((np.zeros((4,1)), pwm.T))
    left_shift_pwm = find_pwm_from_starting_alignment(
        final_rnd_seqs, left_start)
    right_start = np.hstack((pwm.T, np.zeros((4,1))))
    right_shift_pwm = find_pwm_from_starting_alignment(
        final_rnd_seqs, right_start)

    # score only the newly added position on each side; the direction with
    # the smaller entropy wins
    left_shift_score = _entropy(left_shift_pwm[0,:])
    right_shift_score = _entropy(right_shift_pwm[-1,:])

    pyTFbindtools.log("Left shift entropy: %.2e" % left_shift_score, 'VERBOSE')
    pyTFbindtools.log("Right shift entropy: %.2e" % right_shift_score,'VERBOSE')

    return "LEFT" if left_shift_score < right_shift_score else "RIGHT"
def simulate_reads( motif, seq_len, sim_sizes,
                    dna_conc, prot_conc,
                    fname_prefix="test",
                    pool_size = 100000):
    """Simulate successive selection rounds and write the reads to disk.

    A pool of ``pool_size`` random sequences of length ``seq_len`` (values
    0-3) is drawn. For each entry of ``sim_sizes`` (rounds are numbered
    from 1):

    * the occupancy of every pool sequence is computed with
      ``motif.est_occ`` at that round's chemical potential;
    * ``sim_size`` sequences are sampled with replacement with probability
      proportional to occupancy and written, one per line using the
      alphabet ``ACGT``, to ``"<fname_prefix>_rnd_<round>.txt"``;
    * the pool for the next round is resampled (``pool_size`` draws, with
      replacement, again weighted by occupancy) from the selected reads.

    Returns:
        None. Output is the written files plus verbose log messages.
    """
    ref_energy, ddg_array = motif.build_ddg_array()
    num_sites = 2*(seq_len-len(motif)+1)
    chem_pots = est_chem_potentials(
        ddg_array, ref_energy, dna_conc, prot_conc,
        num_sites, len(sim_sizes))

    # one randint call per sequence, so RNG consumption matches the
    # per-sequence draw order
    current_pool = np.array([np.random.randint(4, size=seq_len)
                             for _ in xrange(pool_size)])

    rounds = zip(sim_sizes, chem_pots)
    for rnd, (sim_size, chem_pot) in enumerate(rounds, start=1):
        occs = np.array([motif.est_occ(chem_pot, pool_seq)
                         for pool_seq in current_pool])

        # select this round's reads, weighted by occupancy
        selection_probs = occs/occs.sum()
        picked = np.array(
            np.random.choice(
                len(current_pool), size=sim_size,
                p=selection_probs, replace=True),
            dtype=int)
        seqs = current_pool[picked]
        seq_occs = occs[picked]

        out_fname = "%s_rnd_%i.txt" % (fname_prefix, rnd)
        with open(out_fname, "w") as ofp:
            for seq in seqs:
                ofp.write("".join('ACGT'[x] for x in seq) + "\n")

        # rebuild a full-size pool from the selected reads
        resample_probs = seq_occs/seq_occs.sum()
        resampled = np.random.choice(
            len(seqs), size=pool_size,
            p=resample_probs, replace=True)
        current_pool = seqs[resampled]

        pyTFbindtools.log(
            "Finished simulations for round %i" % rnd, level='VERBOSE')

    pyTFbindtools.log("Finished Simulations", level='VERBOSE')
    pyTFbindtools.log("Ref Energy: %.2f" % ref_energy, level='VERBOSE')
    pyTFbindtools.log("Chem Pots: %s" % chem_pots, level='VERBOSE')
    pyTFbindtools.log(str(ddg_array), level='VERBOSE')
Beispiel #4
0
def fit_model(rnds_and_seqs, ddg_array, ref_energy):
    """Iteratively fit an energy model, growing the motif one base per pass.

    Each pass codes the sequences for the current motif length, re-estimates
    the energy model with ``estimate_dg_matrix_with_adadelta``, and records
    ``[motif_len, lhd_hat, ddg_array, ref_energy]``. Fitting stops when the
    last value of the returned likelihood path is less than 10 above its
    first value, or after the maximum number of passes (at most 20).
    Otherwise three zeros are inserted at the front or appended to the end
    of ``ddg_array``, depending on ``find_best_shift``.

    NOTE: ``dna_conc`` and ``prot_conc`` are not parameters; they are
    resolved from the enclosing (module) scope.

    Returns:
        The final ``(ddg_array, ref_energy)``. The recorded entries are
        printed to stdout before returning.
    """
    opt_path = []
    # the bound is computed once, from the initial motif length
    max_passes = min(
        20, len(rnds_and_seqs[0][0])-ddg_array.motif_len+1)

    for _pass_num in xrange(max_passes):
        bs_len = ddg_array.motif_len

        pyTFbindtools.log("Coding sequences", 'VERBOSE')
        coded_seqs = PartitionedAndCodedSeqs(rnds_and_seqs, bs_len)

        pyTFbindtools.log("Estimating energy model", 'VERBOSE')
        fit_result = estimate_dg_matrix_with_adadelta(
            coded_seqs, ddg_array, ref_energy, dna_conc, prot_conc)
        ddg_array, ref_energy, chem_pots, lhd_path, lhd_hat = fit_result

        opt_path.append([bs_len, lhd_hat, ddg_array, ref_energy])

        # report the state of the freshly fitted model
        pyTFbindtools.log(ddg_array.consensus_seq(), 'VERBOSE')
        pyTFbindtools.log("Ref: %s" % ref_energy, 'VERBOSE')
        mean_energy = ref_energy + ddg_array.sum()/3
        pyTFbindtools.log("Mean: %s" % mean_energy, 'VERBOSE')
        min_energy = ddg_array.calc_min_energy(ref_energy)
        pyTFbindtools.log("Min: %s" % min_energy, 'VERBOSE')
        base_contribs = ddg_array.calc_base_contributions().round(2)
        pyTFbindtools.log(str(base_contribs), 'VERBOSE')

        start_lhd = lhd_path[0]
        end_lhd = lhd_path[-1]
        pyTFbindtools.log("Prev: %.2f\tCurr: %.2f\tDiff: %.2f" % (
            start_lhd, end_lhd, start_lhd-end_lhd), 'VERBOSE')

        # stop once this pass improved the likelihood by less than 10
        if start_lhd + 10 > end_lhd:
            pyTFbindtools.log("Model has finished fitting", 'VERBOSE')
            break

        pyTFbindtools.log("Finding best shift", 'VERBOSE')
        shift_type = find_best_shift(rnds_and_seqs, ddg_array, ref_energy)
        if shift_type == 'LEFT':
            pyTFbindtools.log("Adding left base to motif", level='VERBOSE' )
            extended = np.insert(
                ddg_array, 0, np.zeros(3, dtype='float32'))
            ddg_array = extended.view(DeltaDeltaGArray)
        elif shift_type == 'RIGHT':
            pyTFbindtools.log("Adding right base to motif", level='VERBOSE' )
            extended = np.append(
                ddg_array, np.zeros(3, dtype='float32'))
            ddg_array = extended.view(DeltaDeltaGArray)
        else:
            assert False, "Unrecognized shift type '%s'" % shift_type

    for entry in opt_path:
        print(entry)

    return ddg_array, ref_energy
0
def parse_arguments():
    """Parse the command line of the SELEX energy-model fitting script.

    Besides parsing, this configures module-level state:
    ``pyTFbindtools.VERBOSE`` / ``pyTFbindtools.DEBUG`` from the verbosity
    flags, ``pyTFbindtools.selex.CONVERGENCE_MAX_LHD_CHANGE``,
    ``pyTFbindtools.selex.MAX_NUM_ITER`` and ``pyTFbindtools.selex.MOMENTUM``
    from the optimizer options, and numpy's global random seed when
    ``--random-seed`` is supplied.

    The starting motif comes from ``--starting-pwm`` if given, else from
    ``--starting-energy-model`` if given, else from a ``find_pwm`` search of
    length ``--initial-binding-site-len`` over the loaded sequences.

    Returns:
        A tuple ``(motif, rnds_and_seqs, pool_size)`` where ``rnds_and_seqs``
        is the result of ``load_sequences`` on the ``--selex-files`` names
        and ``pool_size`` is ``--random-seq-pool-size`` converted to ``int``.

    Raises:
        AssertionError: if both ``--starting-pwm`` and
            ``--starting-energy-model`` are given, if ``--momentum`` is not
            in ``[0, 1)``, or if ``load_motifs`` does not return exactly one
            entry.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Estimate energy models from a SELEX experiment.')

    parser.add_argument( '--selex-files', nargs='+', type=file, required=True,
                         help='Files containing SELEX reads.')

    parser.add_argument( '--background-sequence', type=file,
        help='File containing reads sequenced from round 0.')

    parser.add_argument( '--starting-pwm', type=file,
                         help='A PWM to start from.')
    parser.add_argument( '--starting-energy-model', type=file,
                         help='An energy model to start from.')
    parser.add_argument( '--initial-binding-site-len', type=int, default=6,
        help='The starting length of the binding site (this will grow)')

    parser.add_argument( '--lhd-convergence-eps', type=float, default=1e-8,
                         help='Convergence tolerance for lhd change.')
    # parsed as a float (default 1e5) and cast to int when stored below
    parser.add_argument( '--max-iter', type=float, default=1e5,
                         help='Maximum number of optimization iterations.')
    parser.add_argument( '--momentum', type=float, default=0.1,
                         help='Optimization tuning param (between 0 and 1).')

    parser.add_argument( '--random-seed', type=int,
                         help='Set the random number generator seed.')
    # parsed as a float (default 1e6) and cast to int in the return value
    parser.add_argument( '--random-seq-pool-size', type=float, default=1e6,
        help='The random pool size for the bootstrap.')

    parser.add_argument( '--verbose', default=False, action='store_true',
                         help='Print extra status information.')
    parser.add_argument( '--debug-verbose', default=False, action='store_true',
                         help='Print debug information.')

    args = parser.parse_args()
    assert not (args.starting_pwm and args.starting_energy_model), \
            "Can not set both --starting-pwm and --starting-energy-model"

    # --debug-verbose implies --verbose
    pyTFbindtools.VERBOSE = args.verbose or args.debug_verbose
    pyTFbindtools.DEBUG = args.debug_verbose

    pyTFbindtools.selex.CONVERGENCE_MAX_LHD_CHANGE = args.lhd_convergence_eps
    pyTFbindtools.selex.MAX_NUM_ITER = int(args.max_iter)
    assert args.momentum < 1 and args.momentum >= 0
    pyTFbindtools.selex.MOMENTUM = args.momentum

    if args.random_seed is not None:
        np.random.seed(args.random_seed)

    pyTFbindtools.log("Loading sequences", 'VERBOSE')
    rnds_and_seqs = load_sequences(x.name for x in args.selex_files)
    # Only the names of these handles are used here and 'args' is not
    # returned, so release the descriptors argparse opened. The name of a
    # closed file object is still readable.
    for selex_fp in args.selex_files:
        selex_fp.close()
    if args.background_sequence is not None:
        args.background_sequence.close()

    if args.starting_pwm is not None:
        pyTFbindtools.log("Loading PWM starting location", 'VERBOSE')
        # NOTE(review): this passes the open file object and takes
        # values()[0], whereas the simulation script's parse_arguments calls
        # load_motifs(<file>.name) and takes values()[0][0]. load_motifs is
        # not visible here -- confirm which form its current signature and
        # return value expect.
        motifs = load_motifs(args.starting_pwm)
        assert len(motifs) == 1, "Motif file contains multiple motifs"
        motif = motifs.values()[0]
        args.starting_pwm.close()
    elif args.starting_energy_model is not None:
        pyTFbindtools.log("Loading energy data", 'VERBOSE')
        motif = load_energy_data(args.starting_energy_model.name)
        args.starting_energy_model.close()
    else:
        bs_len = args.initial_binding_site_len
        pyTFbindtools.log(
            "Initializing starting location from %imer search" % bs_len,
            'VERBOSE')
        factor_name = 'TEST'
        pwm = find_pwm(rnds_and_seqs, bs_len)
        motif = Motif("aligned_%imer" % bs_len, factor_name, pwm)

    return motif, rnds_and_seqs, int(args.random_seq_pool_size)