Example #1
# Standard-library modules used below; readTheta, fetchRepReads, tokFullIndel,
# calculateFeaturesForGenIndelFile, readFeaturesData, computePredictedProfile,
# fetchIndelSizeCounts and the INDELGENTARGET_EXE constant are assumed to be
# provided by the surrounding module.
import os
import random
import subprocess

def predictMutations(theta_file, target_seq, pam_idx, add_null=True):

    theta, train_set, theta_feature_columns = readTheta(theta_file)

    #generate indels
    left_trim = 0
    tmp_genindels_file = 'tmp_genindels_%s_%d.txt' % (target_seq, random.randint(0,100000))
    cmd = INDELGENTARGET_EXE + ' %s %d %s' % (target_seq, pam_idx, tmp_genindels_file)
    print(cmd); subprocess.check_call(cmd.split())
    rep_reads = fetchRepReads(tmp_genindels_file)
    #Determine a left trim by locating the start of the smallest indel's representative read in the target
    isize, smallest_indel = min([(tokFullIndel(x)[1], x) for x in rep_reads]) if len(rep_reads) > 0 else (0, '-')
    if isize > 0:
        left_trim = target_seq.find(rep_reads[smallest_indel][:10])

    #compute features for all generated indels
    tmp_features_file = 'tmp_features_%s_%d.txt' % (target_seq, random.randint(0,100000))
    calculateFeaturesForGenIndelFile( tmp_genindels_file, target_seq, pam_idx-3, tmp_features_file)
    os.remove(tmp_genindels_file)
    feature_data, feature_columns = readFeaturesData(tmp_features_file)
    os.remove(tmp_features_file)

    if len(set(theta_feature_columns).difference(set(feature_columns))) != 0:
        raise Exception('Stored feature names associated with model thetas are not contained in those computed')

    if len(set(theta_feature_columns).union(set(feature_columns))) != len(theta_feature_columns):
        feature_data = feature_data[['Indel'] + theta_feature_columns]
        feature_columns = theta_feature_columns

    #Predict the profile
    p_predict, _ = computePredictedProfile(feature_data, theta, theta_feature_columns)
    in_frame, out_frame, _ = fetchIndelSizeCounts(p_predict)
    in_frame_perc = in_frame*100.0/(in_frame + out_frame)
    if add_null:
        #Include the null (no indel) outcome in the predicted profile and representative reads
        p_predict['-'] = 1000
        rep_reads['-'] = target_seq[left_trim:]
    return p_predict, rep_reads, in_frame_perc
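
A minimal calling sketch for predictMutations; the theta file path, target sequence, and PAM index below are placeholders rather than values from the source.

theta_file = 'model_output_theta.txt'   #placeholder path to a trained theta file
target_seq = 'ACGT' * 20                #placeholder 80-nt target sequence
pam_idx = 42                            #placeholder 0-based PAM position within target_seq
p_predict, rep_reads, in_frame_perc = predictMutations(theta_file, target_seq, pam_idx)
print('Predicted in-frame percentage: %.1f%%' % in_frame_perc)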
Example #2
import os
import random

import numpy as np
from sklearn.model_selection import KFold

#getFullModelDevGuideSet, loadFeatureLabels, printAndFlush, readTheta, trainModelParallel,
#testModelParallel, writeTheta, recordPredictions and the NUM_OLIGO, OUT_THETA_FILE,
#OUT_PROFILE_DIR constants are assumed to be defined in the surrounding module.
def runAnalysis(guideset_file='model_development_guideset.txt'):

    guideset = getFullModelDevGuideSet(guideset_file)
    sample_names = [
        'ST_Feb_2018_CAS9_12NA_1600X_DPI7', 'ST_June_2017_K562_800x_LV7A_DPI7',
        'ST_June_2017_K562_800x_LV7B_DPI7'
    ]

    feature_columns = loadFeatureLabels([x for x in guideset][0])
    if NUM_OLIGO != -1:
        guideset = random.sample([x for x in guideset], NUM_OLIGO)

    kf = KFold(n_splits=2)
    for i, (train_idx, test_idx) in enumerate(kf.split(guideset)):
        printAndFlush('Cross Validation Fold %d' % (i + 1))
        train_set, test_set = np.array(guideset)[train_idx], np.array(
            guideset)[test_idx]

        outfile = OUT_THETA_FILE + '_cf%d.txt' % i

        theta0 = None
        tmp_file = 'tmp_%s_%d.txt' % (OUT_THETA_FILE, i)
        if os.path.isfile(tmp_file):
            printAndFlush('Loading from previous tmp file')
            theta0, rec_train_set, feature_columns = readTheta(tmp_file)
            #Resuming from a previous run: keep the recorded training set and
            #rebuild the test set from the remaining guides
            test_set = [
                x for x in ([y for y in train_set] + [y for y in test_set])
                if x not in rec_train_set
            ][:int(NUM_OLIGO / 2)]
            train_set = rec_train_set

        printAndFlush('Training')
        theta = trainModelParallel(train_set,
                                   sample_names,
                                   feature_columns,
                                   theta0,
                                   cv_idx=i)
        testModelParallel(
            theta, train_set, sample_names,
            feature_columns)  #Check final training result with lambda=0
        writeTheta(outfile, feature_columns, theta, train_set)
        recordPredictions(OUT_PROFILE_DIR + '_train_%d' % i, theta, train_set,
                          feature_columns)

        printAndFlush('Testing')
        testModelParallel(theta, test_set, sample_names, feature_columns)
        recordPredictions(OUT_PROFILE_DIR + '_test_%d' % i, theta, test_set,
                          feature_columns)
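
A minimal invocation sketch; the guide set filename is the function's default and is assumed to be present in the working directory.

if __name__ == '__main__':
    #Run the 2-fold cross-validation training and testing loop
    runAnalysis('model_development_guideset.txt')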
Example #3
import io

#readTheta, getHighDataDir, loadValidationPairs, loadProfilePair, combineProfiles,
#loadOligoFeaturesAndReadCounts, computePredictedProfile, loadProfilesSeparately,
#getInFramePerc, getCombs, symmetricKL, loadRepReads and plotProfiles are assumed
#to be provided by the surrounding module.
def computeAndComparePredicted(theta_file,
                               selected_id=None,
                               out_dir='.',
                               start_count=0,
                               end_count=10000):

    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    new_sep_labels = 'New 2x800x', 'New 1600x'
    old_sep_labels = 'Old 2x800x', 'Old 1600x'

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    fout = io.open(
        out_dir + '/old_new_kl_predicted_summaries_%d_%d.txt' %
        (start_count, end_count), 'w')
    fout.write(
        u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\t'
    )
    fout.write(u'\t'.join('%s Mut Reads' % x.split('/')[-1]
                          for x in new_sep_labels + old_sep_labels))
    fout.write(
        u'\tOld In Frame Perc\tNew In Frame Perc\tCombined In Frame Perc\tPredicted In Frame Perc\t'
    )
    fout.write(u'\t'.join('%s In Frame Perc' % x.split('/')[-1]
                          for x in new_sep_labels + old_sep_labels))
    fout.write(
        u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\t'
    )
    fout.write(u'\t'.join('%s vs Predicted KL' % x.split('/')[-1]
                          for x in new_sep_labels + old_sep_labels) + '\t')
    fout.write(u'\t'.join([
        '%s vs %s KL' % (x.split('/')[-1], y.split('/')[-1])
        for x, y in (getCombs(new_sep_labels) + getCombs(old_sep_labels))
    ]) + '\n')

    id_pairs = loadValidationPairs()
    for (old_id, new_id) in id_pairs:
        if old_id in train_set or new_id in train_set:
            raise Exception('Bad!!! Testing on Training data: %s %s' %
                            (old_id, new_id))

        if selected_id is not None and selected_id != old_id:
            continue  #Guide pair selected for plotting

        #Load Old and new profiles, and produce combined profile from the two
        p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
            old_id, new_id)
        p_comb, mut_reads_comb = combineProfiles(p_old, p_new, mut_reads_old,
                                                 mut_reads_new)

        #Predict the profile (old and new will be the same so just do one)
        feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
        p_predict, _ = computePredictedProfile(feature_data, theta,
                                               feature_columns)

        #Load separate profiles too
        p_old_sep, p_new_sep, old_sep_mr, new_sep_mr = loadProfilesSeparately(
            old_id, new_id)

        #Compute in frame percentages
        old_if_perc = getInFramePerc(p_old)
        new_if_perc = getInFramePerc(p_new)
        comb_if_perc = getInFramePerc(p_comb)
        pred_if_perc = getInFramePerc(p_predict)
        new_sep_if_percs = [
            getInFramePerc(profile) if len(profile) > 1 else -1
            for profile in p_new_sep
        ]
        old_sep_if_percs = [
            getInFramePerc(profile) if len(profile) > 1 else -1
            for profile in p_old_sep
        ]

        #Plot the comparison
        if selected_id is not None:
            rrds = loadRepReads(new_id)
            plotProfiles([p_new_sep[0], p_new_sep[1], p_predict],
                         [rrds, rrds, rrds], [56, 56, 56],
                         [False, False, False],
                         ['Replicate 1', 'Replicate 2', 'Predicted'],
                         title='%s (KL=%.2f, KL=%.2f)' %
                         (new_id, symmetricKL(p_new_sep[0], p_new_sep[1]),
                          symmetricKL(p_new, p_predict)))

        str_args = (symmetricKL(p_old, p_new), symmetricKL(p_old, p_predict),
                    symmetricKL(p_new,
                                p_predict), symmetricKL(p_comb, p_predict))
        kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f\t' % str_args
        kl_str += u'\t'.join([
            '%.5f' % symmetricKL(p_predict, x) for x in p_new_sep + p_old_sep
        ])
        kl_str += u'\t' + u'\t'.join([
            '%.5f' % symmetricKL(x, y)
            for (x, y) in (getCombs(p_new_sep) + getCombs(p_old_sep))
        ])
        if_str = u'\t'.join(
            ['%.3f' % x for x in new_sep_if_percs + old_sep_if_percs])
        mut_str = u'\t'.join(['%d' % x for x in new_sep_mr + old_sep_mr])
        fout.write(u'%s\t%s\t%d\t%d\t%d\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%s%s\n' %
                   (old_id, new_id, mut_reads_old, mut_reads_new,
                    mut_reads_comb, mut_str, old_if_perc, new_if_perc,
                    comb_if_perc, pred_if_perc, if_str, kl_str))
        fout.flush()
    fout.close()
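
A minimal calling sketch for the summary routine above; the theta file and output directory are placeholders.

import os

out_dir = 'validation_summaries'                       #placeholder output directory
os.makedirs(out_dir, exist_ok=True)
computeAndComparePredicted('model_output_theta.txt',   #placeholder trained theta file
                           out_dir=out_dir)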
Example #4
    #Constructor of a predictor class (only __init__ is shown here): loads the model
    #parameters, the recorded training set, and the feature column names from a saved theta file.
    def __init__(self, theta_file):
        self.theta, self.train_set, self.theta_feature_columns = readTheta(
            theta_file)
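
A sketch of how such a constructor is typically used; the enclosing class is not shown in the source, so the class name below is hypothetical.

class Predictor:  #hypothetical class name; the source shows only its __init__
    def __init__(self, theta_file):
        self.theta, self.train_set, self.theta_feature_columns = readTheta(theta_file)

predictor = Predictor('model_output_theta.txt')   #placeholder theta file path
print('%d feature columns loaded' % len(predictor.theta_feature_columns))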
Example #5
import io

#readTheta, getHighDataDir, loadValidationPairs, loadProfilePair, combineProfiles,
#loadOligoFeaturesAndReadCounts, computePredictedProfile, fetchIndelSizeCounts,
#symmetricKL, loadRepReads and plotProfiles are assumed to be provided by the
#surrounding module.
def computeAndComparePredicted(theta_file, selected_id=None, out_dir='.'):

    features_dir = getHighDataDir() + '/gen_indels/features_for_gen_indels'
    theta, train_set, feature_columns = readTheta(theta_file)

    #Note: here old refers to conventional scaffold library, new refers to improved scaffold library
    fout = io.open(out_dir + '/old_new_kl_predicted_summaries.txt', 'w')
    fout.write(
        u'Old Oligo Id\tNew Oligo Id\tOld Mut Reads\tNew Mut Reads\tCombined Mut Reads\tOld In Frame Perc\tNew In Frame Perc\tCombined In Frame Perc\tPredicted In Frame Perc'
    )
    fout.write(
        u'\tOld v New KL\tOld v Predicted KL\tNew v Predicted KL\tCombined v Predicted KL\n'
    )

    id_pairs = loadValidationPairs()
    for (old_id, new_id) in id_pairs:
        if old_id in train_set or new_id in train_set:
            raise Exception('Bad!!! Testing on Training data: %s %s' %
                            (old_id, new_id))

        if selected_id is not None and selected_id != old_id:
            continue  #Guide pair selected for plotting

        #Load Old and new profiles, and produce combined profile from the two
        p_old, p_new, mut_reads_old, mut_reads_new = loadProfilePair(
            old_id, new_id)
        p_comb, mut_reads_comb = combineProfiles(p_old, p_new, mut_reads_old,
                                                 mut_reads_new)

        #Predict the profile (old and new will be the same so just do one)
        feature_data = loadOligoFeaturesAndReadCounts(new_id, [])
        p_predict, _ = computePredictedProfile(feature_data, theta,
                                               feature_columns)

        #Compute in frame percentages
        old_if, old_of, _ = fetchIndelSizeCounts(p_old)
        new_if, new_of, _ = fetchIndelSizeCounts(p_new)
        comb_if, comb_of, _ = fetchIndelSizeCounts(p_comb)
        pred_if, pred_of, _ = fetchIndelSizeCounts(p_predict)
        old_if_perc = old_if * 100.0 / (old_if + old_of)
        new_if_perc = new_if * 100.0 / (new_if + new_of)
        comb_if_perc = comb_if * 100.0 / (comb_if + comb_of)
        pred_if_perc = pred_if * 100.0 / (pred_if + pred_of)

        #Plot the comparison (old = conventional scaffold, new = improved scaffold)
        if selected_id is not None:
            rrds = loadRepReads(new_id)
            plotProfiles([p_old, p_new, p_predict], [rrds, rrds, rrds],
                         [42, 42, 42], [False, False, False],
                         ['Old scaffold', 'New scaffold', 'Predicted'],
                         title='%s (KL=%.2f, KL=%.2f)' %
                         (new_id, symmetricKL(p_old, p_new),
                          symmetricKL(p_comb, p_predict)))

        str_args = (symmetricKL(p_old, p_new), symmetricKL(p_old, p_predict),
                    symmetricKL(p_new,
                                p_predict), symmetricKL(p_comb, p_predict))
        kl_str = u'\t%.5f\t%.5f\t%.5f\t%.5f' % str_args
        fout.write(
            u'%s\t%s\t%d\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f%s\n' %
            (old_id, new_id, mut_reads_old, mut_reads_new, mut_reads_comb,
             old_if_perc, new_if_perc, comb_if_perc, pred_if_perc, kl_str))
        fout.flush()
    fout.close()
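
A sketch of calling the routine above for a single validation pair so that the comparison plot is produced; the oligo id and theta file path are placeholders.

#Summarise and plot one selected guide pair (placeholder id and path)
computeAndComparePredicted('model_output_theta.txt',
                           selected_id='Oligo12345',
                           out_dir='.')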