def predictMutations(theta_file, target_seq, pam_idx, add_null=True):

    theta, train_set, theta_feature_columns = readTheta(theta_file)

    #Generate candidate indels for the target sequence
    left_trim = 0
    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (target_seq, random.randint(0, 100000))
    cmd = INDELGENTARGET_EXE + ' %s %d %s' % (target_seq, pam_idx, tmp_genindels_file)
    print(cmd)
    subprocess.check_call(cmd.split())
    rep_reads = fetchRepReads(tmp_genindels_file)
    isize, smallest_indel = min([(tokFullIndel(x)[1], x) for x in rep_reads]) if len(rep_reads) > 0 else (0, '-')
    if isize > 0:
        left_trim = target_seq.find(rep_reads[smallest_indel][:10])

    #Compute features for all generated indels
    tmp_features_file = 'tmp_features_%s_%d.txt' % (target_seq, random.randint(0, 100000))
    calculateFeaturesForGenIndelFile(tmp_genindels_file, target_seq, pam_idx - 3, tmp_features_file)
    os.remove(tmp_genindels_file)
    feature_data, feature_columns = readFeaturesData(tmp_features_file)
    os.remove(tmp_features_file)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]
        feature_columns = theta_feature_columns

    #Predict the profile
    p_predict, _ = computePredictedProfile(feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    in_frame_perc = in_frame * 100.0 / (in_frame + out_frame)
    if add_null:
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc
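#Usage sketch for predictMutations (illustrative only, not part of the module above):
#the theta file path, target sequence, and PAM index below are placeholders, and a
#trained theta file plus the indel-generation executable are assumed to be available.
def examplePredictMutations():
    example_theta_file = 'model_output_theta.txt'   #hypothetical path to trained model parameters
    example_target_seq = 'N' * 79                   #placeholder: a real sequence around the cut site is required
    example_pam_idx = 42                            #placeholder: 0-based index of the PAM within the target
    profile, rep_reads, in_frame_perc = predictMutations(example_theta_file, example_target_seq, example_pam_idx)
    print('Predicted in-frame percentage: %.1f%%' % in_frame_perc)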
def runAnalysis(guideset_file='model_development_guideset.txt'):

    guideset = getFullModelDevGuideSet(guideset_file)
    sample_names = ['ST_Feb_2018_CAS9_12NA_1600X_DPI7',
                    'ST_June_2017_K562_800x_LV7A_DPI7',
                    'ST_June_2017_K562_800x_LV7B_DPI7']
    feature_columns = loadFeatureLabels([x for x in guideset][0])
    if NUM_OLIGO != -1:
        guideset = random.sample([x for x in guideset], NUM_OLIGO)

    kf = KFold(n_splits=2)
    for i, (train_idx, test_idx) in enumerate(kf.split(guideset)):

        printAndFlush('Cross Validation Fold %d' % (i + 1))
        train_set, test_set = np.array(guideset)[train_idx], np.array(guideset)[test_idx]
        outfile = OUT_THETA_FILE + '_cf%d.txt' % i

        theta0 = None
        tmp_file = 'tmp_%s_%d.txt' % (OUT_THETA_FILE, i)
        if os.path.isfile(tmp_file):
            printAndFlush('Loading from previous tmp file')
            theta0, rec_train_set, feature_columns = readTheta(tmp_file)
            test_set = [x for x in ([y for y in train_set] + [y for y in test_set])
                        if x not in rec_train_set][:int(NUM_OLIGO / 2)]
            train_set = rec_train_set

        printAndFlush('Training')
        theta = trainModelParallel(train_set, sample_names, feature_columns, theta0, cv_idx=i)
        testModelParallel(theta, train_set, sample_names, feature_columns)  #Check final training result with lambda=0
        writeTheta(OUT_THETA_FILE + '_cf%d.txt' % i, feature_columns, theta, train_set)
        recordPredictions(OUT_PROFILE_DIR + '_train_%d' % i, theta, train_set, feature_columns)

        printAndFlush('Testing')
        testModelParallel(theta, test_set, sample_names, feature_columns)
        recordPredictions(OUT_PROFILE_DIR + '_test_%d' % i, theta, test_set, feature_columns)
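#Standalone illustration of the 2-fold cross-validation split used in runAnalysis
#(assumes only numpy and scikit-learn; the guide ids are made up). Each fold trains
#on one half of the guide set and tests on the other.
def exampleKFoldSplit():
    import numpy as np
    from sklearn.model_selection import KFold
    guides = np.array(['Oligo_1', 'Oligo_2', 'Oligo_3', 'Oligo_4'])
    for i, (train_idx, test_idx) in enumerate(KFold(n_splits=2).split(guides)):
        print('Fold %d: train=%s test=%s' % (i + 1, list(guides[train_idx]), list(guides[test_idx])))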
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.', start_count=0, end_count=10000):

    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    fout = io.open(out_dir + '/old_new_kl_predicted_summaries_%d_%d.txt' % (start_count, end_count), 'w')
    fout.write(u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t')
    fout.write(u'\t'.join('%s Mut Reads' % x.split('/')[-1] for x in new_sep_labels + old_sep_labels))
    fout.write(u'\tOld In Frame Perc\tNew In Frame Perc\tCombined In Frame Perc\tPredicted In Frame Perc\t')
    fout.write(u'\t'.join('%s In Frame Perc' % x.split('/')[-1] for x in new_sep_labels + old_sep_labels))
    fout.write(u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t')
    fout.write(u'\t'.join('%s vs Predicted KL' % x.split('/')[-1] for x in new_sep_labels + old_sep_labels) + '\t')
    fout.write(u'\t'.join(['%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
                           for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))]) + '\n')

    id_pairs = loadValidationPairs()
    for (old_id, new_id) in id_pairs:

        if old_id in train_set or new_id in train_set:
            raise Exception('Bad!!! Testing on Training data: %s %s' % (old_id, new_id))
        if selected_id is not None and selected_id != old_id:
            continue  #Guide pair selected for plotting

        #Load old and new profiles, and produce a combined profile from the two
        p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(old_id, new_id)
        p_comb, mut_reads_comb = combineProfiles(p_old, p_new, mut_reads_old, mut_reads_new)

        #Predict the profile (old and new will be the same so just do one)
        feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
        p_predict, _ = computePredictedProfile(feature_data, theta, feature_columns)

        #Load separate profiles too
        p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(old_id, new_id)

        #Compute in frame percentages
        old_if_perc = getInFramePerc(p_old)
        new_if_perc = getInFramePerc(p_new)
        comb_if_perc = getInFramePerc(p_comb)
        pred_if_perc = getInFramePerc(p_predict)
        new_sep_if_percs = [getInFramePerc(profile) if len(profile) > 1 else -1 for profile in p_new_sep]
        old_sep_if_percs = [getInFramePerc(profile) if len(profile) > 1 else -1 for profile in p_old_sep]

        #Plot the comparison
        if selected_id is not None:
            rrds = loadRepReads(new_id)
            plotProfiles([p_new_sep[0], p_new_sep[1], p_predict], [rrds, rrds, rrds], [56, 56, 56],
                         [False, False, False], ['Replicate 1', 'Replicate 2', 'Predicted'],
                         title='%s (KL=%.2f, KL=%.2f)' % (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                                                          symmetricKL(p_new, p_predict)))

        str_args = (symmetricKL(p_old, p_new), symmetricKL(p_old, p_predict),
                    symmetricKL(p_new, p_predict), symmetricKL(p_comb, p_predict))
        kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
        kl_str += u'\t'.join(['%.5f' % symmetricKL(p_predict, x) for x in p_new_sep + p_old_sep])
        kl_str += u'\t' + u'\t'.join(['%.5f' % symmetricKL(x, y)
                                      for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))])
        if_str = u'\t'.join(['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
        mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])
        fout.write(u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                   (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb, mut_str,
                    old_if_perc, new_if_perc, comb_if_perc, pred_if_perc, if_str, kl_str))
        fout.flush()
    fout.close()
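#Standalone sketch of the quantity reported in the summary above. This is NOT the
#module's own symmetricKL (which operates on its profile objects); it illustrates a
#symmetric KL divergence between two indel-count distributions given as plain dicts,
#with a small pseudocount to guard against zero probabilities.
def exampleSymmetricKL(p1, p2, pseudo=1e-6):
    import math
    indels = set(p1) | set(p2)
    n1 = sum(p1.get(x, 0) + pseudo for x in indels)
    n2 = sum(p2.get(x, 0) + pseudo for x in indels)
    kl12, kl21 = 0.0, 0.0
    for x in indels:
        q1 = (p1.get(x, 0) + pseudo) / n1
        q2 = (p2.get(x, 0) + pseudo) / n2
        kl12 += q1 * math.log2(q1 / q2)
        kl21 += q2 * math.log2(q2 / q1)
    return 0.5 * (kl12 + kl21)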
def __init__(self, theta_file):
    self.theta, self.train_set, self.theta_feature_columns = readTheta(theta_file)
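#Usage sketch (assumption: the __init__ above belongs to a predictor class, referred
#to here as 'Predictor' purely for illustration; the theta file path is a placeholder).
#Loading theta once in the constructor lets repeated predictions share a single
#readTheta call.
def examplePredictorUsage():
    predictor = Predictor('model_output_theta.txt')   #hypothetical class name and file path
    print('%d model weights over %d features' % (len(predictor.theta), len(predictor.theta_feature_columns)))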
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.'):

    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    fout = io.open(out_dir + '/old_new_kl_predicted_summaries.txt', 'w')
    fout.write(u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\tOld In Frame Perc\tNew In Frame Perc\tCombined In Frame Perc\tPredicted In Frame Perc')
    fout.write(u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\n')

    id_pairs = loadValidationPairs()
    for (old_id, new_id) in id_pairs:

        if old_id in train_set or new_id in train_set:
            raise Exception('Bad!!! Testing on Training data: %s %s' % (old_id, new_id))
        if selected_id is not None and selected_id != old_id:
            continue  #Guide pair selected for plotting

        #Load old and new profiles, and produce a combined profile from the two
        p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(old_id, new_id)
        p_comb, mut_reads_comb = combineProfiles(p_old, p_new, mut_reads_old, mut_reads_new)

        #Predict the profile (old and new will be the same so just do one)
        feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
        p_predict, _ = computePredictedProfile(feature_data, theta, feature_columns)

        #Compute in frame percentages
        old_if, old_of, _ = fetchIndelSizeCounts(p_old)
        new_if, new_of, _ = fetchIndelSizeCounts(p_new)
        comb_if, comb_of, _ = fetchIndelSizeCounts(p_comb)
        pred_if, pred_of, _ = fetchIndelSizeCounts(p_predict)
        old_if_perc = old_if * 100.0 / (old_if + old_of)
        new_if_perc = new_if * 100.0 / (new_if + new_of)
        comb_if_perc = comb_if * 100.0 / (comb_if + comb_of)
        pred_if_perc = pred_if * 100.0 / (pred_if + pred_of)

        #Plot the comparison
        if selected_id is not None:
            rrds = loadRepReads(new_id)
            plotProfiles([p_old, p_new, p_predict], [rrds, rrds, rrds], [42, 42, 42],
                         [False, False, False], ['Replicate 1', 'Replicate 2', 'Predicted'],
                         title='%s (KL=%.2f, KL=%.2f)' % (new_id, symmetricKL(p_old, p_new),
                                                          symmetricKL(p_comb, p_predict)))

        str_args = (symmetricKL(p_old, p_new), symmetricKL(p_old, p_predict),
                    symmetricKL(p_new, p_predict), symmetricKL(p_comb, p_predict))
        kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f' % str_args
        fout.write(u'%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f%s\n' %
                   (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
                    old_if_perc, new_if_perc, comb_if_perc, pred_if_perc, kl_str))
        fout.flush()
    fout.close()
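#Follow-up sketch (assumes pandas is available and the summary file has already been
#written by computeAndComparePredicted above): reading the tab-delimited summary back
#for downstream analysis, e.g. comparing measured vs predicted in-frame percentages.
def exampleLoadSummary(out_dir='.'):
    import pandas as pd
    data = pd.read_csv(out_dir + '/old_new_kl_predicted_summaries.txt', sep='\t')
    return data[['Combined In Frame Perc', 'Predicted In Frame Perc', 'Combined v Predicted KL']]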