Ejemplo n.º 1
0
def readBafSamples(baffile, bafrawdata):
    """Read per-sample BAF values for the current patient into ``bafrawdata``.

    The file ``BAF_dir + patient + "_BAF.txt"`` is read (resolved through
    ``readlink`` first when ``BAF_links`` is set).  For every data row whose
    chromosome and position already exist in ``bafrawdata``, each parseable
    value is stored as ``bafrawdata[chr][pos][sample]``.

    NOTE(review): the ``baffile`` argument is never read; the path is built
    from the module-level ``patient``.  Callers appear to pass the patient
    id in this slot -- confirm whether the parameter was meant to be used.

    Parameters:
        baffile: ignored (see note above).
        bafrawdata: nested dict ``{chr: {pos: {}}}``; updated in place.

    Returns:
        ``(allbafs, allsamples)`` where ``allbafs`` maps ``"cnvi"`` and
        ``"normal"`` to lists of the values read, and ``allsamples`` lists
        the sample names taken from the header line.  Returns ``None`` when
        the BAF file does not exist.
    """
    if BAF_links:
        baffile = readlink(BAF_dir + patient + "_BAF.txt")
    else:
        baffile = BAF_dir + patient + "_BAF.txt"
    if not isfile(baffile):
        print("ERROR:  no BAF file found for patient", patient)
        return None
    print("Reading BAF sample data for patient", patient)

    labels = []
    allsamples = []
    allbafs = {"cnvi": [], "normal": []}
    # The context manager closes the file even if a malformed row raises.
    with open(baffile, "r") as infile:
        for line in infile:
            lvec = line.split()
            if "Chr" in line:
                # Header row: the first two labels are skipped, the rest are
                # quoted sample names.
                labels = lvec
                allsamples.extend(label.split('"')[1] for label in labels[2:])
                continue
            if len(lvec) < 3:
                # Blank or truncated row: nothing to read.
                continue
            chrom = lvec[1].split('"')[1]
            pos = int(lvec[2])
            if chrom not in bafrawdata or pos not in bafrawdata[chrom]:
                continue
            # Rows mentioning "cnvi" are collected separately from the rest.
            bucket = allbafs["cnvi"] if "cnvi" in line else allbafs["normal"]
            for p in range(3, len(lvec)):
                # Data rows carry one more leading column than the header,
                # so value column p belongs to header label p-1.
                sample = labels[p - 1].split('"')[1]
                try:
                    value = float(lvec[p])
                except ValueError:
                    # Non-numeric entry: skip this sample for this row.
                    continue
                bafrawdata[chrom][pos][sample] = value
                bucket.append(value)

    if showgraphs:
        print("Sample CNVI bafs for normal CNVIs that were 0.5 in wt:")
        lsl.createPrintAndSaveHistogram(allbafs["cnvi"], "", .01)
    # The "normal" values are collected and returned but not plotted here.
    return allbafs, allsamples
                loss_data.append(avg_log2r)
            #loss_data.append(avg_log2r)
        elif (call == "wt"):
            wt_data.append(avg_log2r)
        elif (call == "Gain"):
            gain_data.append(avg_log2r)
        elif (call == "Balanced_gain"):
            balanced_gain_data.append(avg_log2r)
        else:
            print "Unknown call ", call

binwidth = 0.001

# One entry per histogram: the heading printed first, the values to bin, and
# the path handed to createPrintAndSaveHistogram.
histogram_jobs = [
    ("Double-loss from doubled genomes histogram:",
     double_loss_from_doubled_data,
     "CN_rejoined_histograms/double_loss_from_doubled_hist.txt"),
    ("Loss from doubled genomes histogram:",
     loss_from_doubled_data,
     "CN_rejoined_histograms/loss_from_doubled_hist.txt"),
    ("Double-loss histogram:",
     double_loss_data,
     "CN_rejoined_histograms/double_loss.txt"),
    ("Loss histogram:",
     loss_data,
     "CN_rejoined_histograms/loss.txt"),
]
for heading, values, outpath in histogram_jobs:
    print(heading)
    lsl.createPrintAndSaveHistogram(values, outpath, binwidth)
            if lvec[n] != "":
                if call not in data[group]:
                    data[group][call] = []
                data[group][call].append(float(lvec[n]))
    for label in data:
        for call in data[label]:
            if len(data[label][call]) == 0:
                continue
            if len(data[label][call]) < 100:
                #Skip groups with fewer than 100 VAFs.
                continue
            filename = patient + "_" + sample + "_" + makeFilename(
                label) + "_" + str(call[0]) + "_" + str(call[1]) + "_hist.png"
            hist = lsl.createPrintAndSaveHistogram(data[label][call],
                                                   filename,
                                                   0.001,
                                                   xdata="VAF",
                                                   savefig=False,
                                                   show=False)
            mean = numpy.mean(data[label][call])
            stdev = numpy.std(data[label][call])
            histmaxes = getHistMaxes(hist)
            print(patient, sample, label, call)
            ###THIS IS WHERE YOU FIND THE HISTOGRAM PEAKS###
            ##Data:  data[label][call]
            ##Peaks:  histmaxes
            ##Peak heights:  hist[histmaxes[n]]
            ##Stdev:  stdev

            emdata = mixture.DataSet()
            emdata.fromList(data[label][call])
            numpeaks = len(histmaxes)
Ejemplo n.º 4
0
        elif (nmarkers < 21):
            data13_20.append(meanlog2r)
        elif (nmarkers < 51):
            data21_50.append(meanlog2r)
        elif (nmarkers < 501):
            data51_500.append(meanlog2r)
        elif (nmarkers < 5001):
            data501_5000.append(meanlog2r)
        elif (nmarkers < 50001):
            data5001_50000.append(meanlog2r)
        elif (nmarkers < 500001):
            data50001_plus.append(meanlog2r)

binwidth = 0.001
# Histogram each marker-count bucket in turn, all with the same bin width.
for values, outpath in [
        (data10_12, "full_segmentation_histograms/data10_12.txt"),
        (data13_20, "full_segmentation_histograms/data13_20.txt"),
        (data21_50, "full_segmentation_histograms/data21_50.txt"),
        (data51_500, "full_segmentation_histograms/data51_500.txt"),
        (data501_5000, "full_segmentation_histograms/data501_5000.txt"),
        (data5001_50000, "full_segmentation_histograms/data5001_50000.txt"),
]:
    lsl.createPrintAndSaveHistogram(values, outpath, binwidth)
Ejemplo n.º 5
0
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 29 16:18:22 2016

@author: lpsmith
"""

import lucianSNPLibrary as lsl

# Read one integer break position per line.  The context manager makes sure
# the input file is closed, which the original script never did.
data = []
with open("prunebreaks.txt", "r") as prune:
    for line in prune:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        point = int(line)
        data.append(point)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print(max(data))
binwidth = 1
label = "position"

lsl.createPrintAndSaveHistogram(data, "mary_out.txt", binwidth, xdata=label)
Ejemplo n.º 6
0
            for validity in valid_out:
                allinvalid.write(str(validity) + "\t")
            length = int(segid[2]) - int(segid[1])
            invalidlengths[int(segid[3])].append(length)
            allinvalid.write(str(length) + "\n")
            
    outfile.close()

allvalid.close()
allinvalid.close()

# Summarise segment lengths per copy-number call 0..7, skipping call 1.
# Statistics are printed only for groups with more than 10 entries, and a
# histogram is drawn only for groups with more than 30.
for n in range(8):
    if n != 1:
        valid = validlengths[n]
        nvalid = len(valid)
        if nvalid > 10:
            print("Valid lengths for copy number call", str(n))
            print("  number of calls:", str(nvalid))
            print("  mean: ", str(np.mean(valid)))
            print("  median: ", str(np.median(valid)))
            print("  stdev: ", str(np.std(valid)))
            if nvalid > 30:
                x = lsl.createPrintAndSaveHistogram(valid, "", 10000)
        invalid = invalidlengths[n]
        ninvalid = len(invalid)
        if ninvalid > 10:
            print("Invalid lengths for copy number call", str(n))
            print("  number of calls:", str(ninvalid))
            print("  mean: ", str(np.mean(invalid)))
            print("  median: ", str(np.median(invalid)))
            print("  stdev: ", str(np.std(invalid)))
            if ninvalid > 30:
                y = lsl.createPrintAndSaveHistogram(invalid, "", 10000)
Ejemplo n.º 7
0
                    if bal_calls[chr][b_seg] == "Balanced":
                        allbal.append(rawA)
#                        print("Balanced:", chr, ub_seg)
                    else:
                        allunbal.append(rawA)


#                        print("Unbalanced:", chr, ub_seg)

allbal = []
allunbal = []

# Gather every file name found anywhere below CNdir.
files = []
for _dirpath, _dirnames, filenames in walk(CNdir):
    files.extend(filenames)

# Only "nonint" files are compared; the first three underscore-separated
# fields of the file name are the patient, sample and ploidy.
for f in files:
    if "nonint" in f:
        patient, sample, ploidy = f.split("_")[:3]
        if not onlysomepatients or patient in somepatients:
            unbal_calls = readAmbiguousCallsFromASCAT(f)
            bal_calls = lsl.readBalancedCalls(patient, sample)
            print("Comparing", patient, sample, ploidy)
            compareAndReport(unbal_calls, bal_calls, patient, sample, ploidy,
                             allbal, allunbal)

print("All balanced values:")
lsl.createPrintAndSaveHistogram(allbal, "", 0.001)
print("All unbalanced values:")
x = lsl.createPrintAndSaveHistogram(allunbal, "", 0.001, axis=[0, 5, 0])
Ejemplo n.º 8
0
def combineTwoBafs(patient1, patient2):
    """Combine the BAF data of two patients at their shared positions.

    Steps:
      1. Read each patient's ``<patient>_Normal_BAF.txt`` and keep positions
         whose normal BAF lies within ``[bafWtLow, bafWtHigh]``.  Values
         outside ``[0.1, 0.9]`` are dropped before that test.
      2. Restrict patient1's positions to those also kept for patient2.
      3. Read each patient's ``<patient>_BAF.txt`` and attach the per-sample
         values to the shared positions.
      4. Write the combined table to ``outdir + "two_patients_input.txt"``
         (tab-separated, ``NA`` for missing sample values).

    File paths are built from ``BAF_dir`` and resolved through ``readlink``
    when ``BAF_links`` is set.

    Returns:
        On success, the tuple
        ``(bafrawdata, bafwt, allbafs, allwtbafs, allsamples)``:
        ``bafrawdata`` is ``{chr: {pos: {sample: baf}}}`` for the shared
        positions, ``bafwt`` is patient1's ``{chr: {pos: normal_baf}}``,
        ``allbafs`` maps ``"cnvi"``/``"normal"`` to lists of sample values
        from both patients, ``allwtbafs`` lists the normal values inside
        ``[0.1, 0.9]``, and ``allsamples`` lists all sample names.

        NOTE(review): the error paths keep their historical shapes --
        ``({}, {})`` when a Normal BAF file is missing and ``None`` when a
        sample BAF file is missing -- so callers must check before
        unpacking five values.
    """
    bafwt = {}
    brd = {}
    allwtbafs = []
    for patient in (patient1, patient2):
        brd[patient] = {}
        bafwt[patient] = {}
        if BAF_links:
            bafnormal = readlink(BAF_dir + patient + "_Normal_BAF.txt")
        else:
            bafnormal = BAF_dir + patient + "_Normal_BAF.txt"
        if not isfile(bafnormal):
            print("ERROR:  no Normal BAF file found for patient", patient)
            return ({}, {})
        with open(bafnormal, "r") as normalfile:
            print("Reading BAF normal data for patient", patient)
            for line in normalfile:
                if "Chr" in line:
                    # Header row.
                    continue
                lvec = line.split()
                try:
                    value = float(lvec[3])
                except (ValueError, IndexError):
                    # Missing or non-numeric normal BAF: skip the row.
                    continue
                if value < 0.1 or value > 0.9:
                    continue
                allwtbafs.append(value)
                if value < bafWtLow or value > bafWtHigh:
                    continue
                chrom = lvec[1].split('"')[1]
                pos = int(lvec[2])
                brd[patient].setdefault(chrom, {})[pos] = {}
                bafwt[patient].setdefault(chrom, {})[pos] = value
    if showgraphs:
        compareNormalBafs(bafwt, patient1, patient2)
        lsl.createPrintAndSaveHistogram(allwtbafs, "", 0.01)

    # Keep only patient1 positions that patient2 also has.  A chromosome
    # absent for patient2 contributes no shared positions (the original
    # raised KeyError in that case).
    bafrawdata = brd[patient1]
    for chrom, positions in bafrawdata.items():
        shared = brd[patient2].get(chrom, {})
        for pos in [p for p in positions if p not in shared]:
            del positions[pos]

    allsamples = []
    # Accumulated across both patients; the original re-created this dict
    # inside the loop, so the returned values covered only patient2.
    allbafs = {"cnvi": [], "normal": []}
    for patient in (patient1, patient2):
        labels = []
        if BAF_links:
            baffile = readlink(BAF_dir + patient + "_BAF.txt")
        else:
            baffile = BAF_dir + patient + "_BAF.txt"
        if not isfile(baffile):
            print("ERROR:  no BAF file found for patient", patient)
            return None
        print("Reading BAF sample data for patient", patient)
        with open(baffile, "r") as samplefile:
            for line in samplefile:
                lvec = line.split()
                if "Chr" in line:
                    # Header row: labels from index 2 on are sample names.
                    labels = lvec
                    allsamples.extend(
                        label.split('"')[1] for label in lvec[2:])
                    continue
                if len(lvec) < 3:
                    # Blank or truncated row.
                    continue
                chrom = lvec[1].split('"')[1]
                pos = int(lvec[2])
                if chrom not in bafrawdata or pos not in bafrawdata[chrom]:
                    continue
                if "cnvi" in line:
                    bucket = allbafs["cnvi"]
                else:
                    bucket = allbafs["normal"]
                for p in range(3, len(lvec)):
                    # Value column p pairs with header label p-1.
                    sample = labels[p - 1].split('"')[1]
                    try:
                        value = float(lvec[p])
                    except ValueError:
                        continue
                    bafrawdata[chrom][pos][sample] = value
                    bucket.append(value)

    # Write the combined table; the context manager flushes and closes it.
    with open(outdir + "two_patients_input.txt", "w") as outfile:
        outfile.write("\t".join(["Chr", "pos"] + allsamples) + "\n")
        for chrom in bafrawdata:
            for pos in bafrawdata[chrom]:
                values = bafrawdata[chrom][pos]
                fields = [chrom, str(pos)]
                for sample in allsamples:
                    if sample in values:
                        fields.append(str(values[sample]))
                    else:
                        fields.append("NA")
                outfile.write("\t".join(fields) + "\n")
    return bafrawdata, bafwt[patient1], allbafs, allwtbafs, allsamples
Created on Thu Jul  7 10:33:59 2016

@author: lpsmith
"""

from __future__ import division
from os import walk

import lucianSNPLibrary as lsl
import numpy

# read the filtered data that compares Xiaohong's segmentation data with raw SNP data

#filenames = ["1049_20780_avglog2rs.txt", "1049_20782_avglog2rs.txt"]
filename = "diseqs.txt"
all_data = []
id = 0
# A fixed bin width is used for every line.  A data-derived width (range/100
# rounded down to a power of ten) was tried previously and abandoned.
binwidth = 0.001

# Each line of the input becomes its own histogram, numbered from 1.
with open(filename, "r") as file:
    for line in file:
        id += 1
        # Build a real list before handing it to numpy: on Python 3 `map`
        # returns an iterator, which numpy.array would wrap as one object.
        all_data = numpy.array(
            [float(field) for field in line.rstrip().split()])
        lsl.createPrintAndSaveHistogram(all_data,
                                        "diseq_" + str(id) + ".txt",
                                        binwidth,
                                        xdata="diseq")
Ejemplo n.º 10
0
    for line in segmented_file:
        if (line.find("chr") != -1):
            continue
        this_line = patient + "\t" + sample + "\t" + line
        (chr, start, end, xLog2r, call, nlog2r, log2r, stdev) = line.rstrip().split()
        chr = int(chr)
        if (chr != 9):
            continue
        start = int(start)
        if (start > 21995301):
            continue
        if (end=="inf"):
            end = 30000000 #Greater than the end of the gene
        end = int(end)
        if (end < 21967752):
            continue
        all_data.append(float(log2r))
        if call == "Loss":
            loss_data.append(float(log2r))
        elif call == "Double_d":
            double_loss_data.append(float(log2r))
        elif call == "wt":
            wt_data.append(float(log2r))
        outfile.write(this_line)

outfile.close()
# Histogram the log2r values per call type for the p16 region, all with the
# same bin width and axis.
p16_axis = (-3.5, 1.5, 0)
for values, stem in (
        (double_loss_data, "short_segments/p16_double"),
        (loss_data, "short_segments/p16_loss"),
        (wt_data, "short_segments/p16_wt"),
        (all_data, "short_segments/p16_all"),
):
    lsl.createPrintAndSaveHistogram(values, stem + rejoin + ".txt", 0.001,
                                    axis=p16_axis)
Ejemplo n.º 11
0
    for f in flist:
        if f.find(".spstats") == -1:
            continue
        (patient, sample, tag) = f.split("_")
        statfile = open(directory + f, "r")
        for line in statfile:
            if line.find("expands") != -1:
                continue
            if line.find("Mean") != -1:
                continue
            splitline = line.split()
            if len(splitline) < 2:
                continue
            noisevals.append(float(splitline[1]))
        statfile.close()
    if (len(noisevals) > 0):
        outfile.write(directory + "\t")
        outfile.write(str(numpy.average(noisevals)) + "\t")
        outfile.write(str(numpy.std(noisevals)) + "\t")
        outfile.write(str(numpy.median(noisevals)) + "\t")
        outfile.write(str(numpy.max(noisevals)) + "\t")
        outfile.write(str(numpy.min(noisevals)) + "\n")
        lsl.createPrintAndSaveHistogram(noisevals,
                                        "noiseout.txt",
                                        0.01,
                                        xdata="noise")
outfile.close()
of = open("noisevals.txt", "w")
of.write(str(noisevals))
of.close()
            else:
                print "Unknown call ", call

rangestr = "_"
if (use_max):
    rangestr += "only_" + str(nsamples_min) + "-" + str(nsamples_max) + "_"
if (use_length):
    rangestr = "_only_" + str(length_min) + "-" + str(length_max) + "_"

if (use_baf):
    print "Double-loss histograms:"
    index = 0
    combined_data = []
    for dataset in double_loss_data:
        lsl.createPrintAndSaveHistogram(
            dataset, output_directory + "double_loss_hist" + rangestr +
            str(index) + ".txt", g_binwidth)
        combined_data += dataset
        index += 1
    lsl.createPrintAndSaveHistogram(
        combined_data,
        output_directory + "double_loss_hist" + rangestr + "all.txt",
        g_binwidth)

    combined_data = []
    print "Loss histograms:"
    index = 0
    for dataset in loss_data:
        lsl.createPrintAndSaveHistogram(
            dataset,
            output_directory + "loss_hist" + rangestr + str(index) + ".txt",
Ejemplo n.º 13
0
            for call in calls:
                overview_out.write("\t" +
                                   str(overview_bases[sample][analysis][call]))
            writeDerivedStatistic(overview_out, overview, sample, analysis,
                                  ["TP", "TN"], ["FP", "FN"])
            writeDerivedStatistic(overview_out, overview_bases, sample,
                                  analysis, ["TP", "TN"], ["FP", "FN"])
            overview_out.write("\n")
    overview_out.close()


Xiaohong_segments, X_totsca = readAllXiaohongSegmentation()

# Gather every file name found anywhere below BAF_dir.
files = []
for _dirpath, _dirnames, filenames in walk(BAF_dir):
    files.extend(filenames)

# Process each patient that has a Normal BAF file; the patient id is the
# file-name prefix before the first underscore.
for f in files:
    if "_Normal_BAF.txt" in f:
        patient = f.split("_")[0]
        if not onlysomepatients or patient in somepatients:
            bafrawdata, bafwt = readBafNormal(patient)
            if len(bafrawdata) != 0:
                readBafSamples(patient, bafrawdata)

for key, title in (('1m', "1M BAFs"), ('25m', "2.5M BAFs")):
    lsl.createPrintAndSaveHistogram(allbafs[key], title, .01)
        if intA == 2:
            data_22.append(log2r)
        elif intA == 3:
            data_23.append(log2r)
    elif intB == 3:
        if intA == 3:
            data_33.append(log2r)

rangestr = "_" + str(nsamples_min) + "-" + str(nsamples_max) + "_"

thisaxis = [-3.5, 1.5, 0]

print "Double-loss histograms:"
lsl.createPrintAndSaveHistogram(
    double_loss_data,
    "ASCAT_smoothed_histograms/double_loss_hist_a" + rangestr + ".txt",
    g_binwidth,
    axis=[-3.5, 1.5, 0])

print "Loss histograms:"
lsl.createPrintAndSaveHistogram(loss_data,
                                "ASCAT_smoothed_histograms/loss_hist_a" +
                                rangestr + "all.txt",
                                g_binwidth,
                                axis=thisaxis)

print "WT histograms:"
lsl.createPrintAndSaveHistogram(wt_data,
                                "ASCAT_smoothed_histograms/wt_hist_a" +
                                rangestr + "all.txt",
                                g_binwidth,
        if f.find("LOH") != -1:
            readXiaohongWGSLOHFile(Xdir_WGS + f, Xiaohong_segments, totsca)
        else:
            readXiaohongCopynumFile(Xdir_WGS + f, Xiaohong_segments, totsca)
    files = []
    for (__, __, f) in walk(Xdir_1M):
        files += f
    for f in files:
        if f.find("read") != -1:
            continue
        if f.find("LOH") != -1:
            readXiaohong1MLOHFile(Xdir_1M + f, Xiaohong_segments, totsca)
        else:
            readXiaohongCopynumFile(Xdir_1M + f, Xiaohong_segments, totsca)
    return Xiaohong_segments, totsca


Xsegs, totsca = readAllXiaohongSegmentation()

# Collect log10 of every positive gap between consecutive segments, after
# sorting each chromosome's segment list in place.
alldiffs = []
for patient, samples in Xsegs.items():
    for sample, chromosomes in samples.items():
        for segments in chromosomes.values():
            segments.sort()
            for previous, following in zip(segments, segments[1:]):
                endlast = previous[1]
                startnext = following[0]
                if startnext > endlast:
                    alldiffs.append(numpy.log10(startnext - endlast))

lsl.createPrintAndSaveHistogram(alldiffs, "", 0.01)
                loss_from_doubled_data.append(avg_log2r)
            else:
                loss_data.append(avg_log2r)
            #loss_data.append(avg_log2r)
        elif (call == "wt"):
            wt_data.append(avg_log2r)
        elif (call == "Gain"):
            gain_data.append(avg_log2r)
        elif (call == "Balanced_gain"):
            balanced_gain_data.append(avg_log2r)
        else:
            print "Unknown call ", call

    lsl.createPrintAndSaveHistogram(double_loss_from_doubled_data,
                                    outdir + str(patient) + "_" + str(sample) +
                                    "_smoothhist.txt",
                                    binwidth,
                                    show=False)

print "Double-loss from doubled genomes histogram:"
lsl.createPrintAndSaveHistogram(double_loss_from_doubled_data,
                                outdir + "double_loss_from_doubled_hist" +
                                srange + ".txt",
                                binwidth,
                                axis=(-3.5, 1.5, 0))

print "Loss from doubled genomes histogram:"
lsl.createPrintAndSaveHistogram(loss_from_doubled_data,
                                outdir + "loss_from_doubled_hist" + srange +
                                ".txt",
                                binwidth,
         intersection_file.write(patient)
         intersection_file.write("\t" + sample)
         intersection_file.write("\t" + chrom)
         intersection_file.write("\t" + str(pos))
         intersection_file.write("\n")
     elif compare == "dip_CNLOH":
         dip_CNLOH_VAFs.append(VAF)
 intersection_file.close()
 if just_intersection:
     continue
 if not justdip:
     #print("All VAFs for patient", patient, "sample", sample, ":")
     hist = lsl.createPrintAndSaveHistogram(allVAFs,
                                            VAFpngdir + patient + "_" +
                                            sample + "_VAF_hist",
                                            0.001,
                                            xdata="VAF",
                                            show=runLocally,
                                            savefig=True,
                                            axis=(0, 1.1, 0))
     #print("VAFs for positions called 1/1 in diploid and 2/2 in tetraploid,", patient, "sample", sample, ":")
     hist = lsl.createPrintAndSaveHistogram(twovfour_VAFs,
                                            VAF2v4dir + patient + "_" +
                                            sample + "_2v4_VAF_hist",
                                            0.001,
                                            xdata="VAF",
                                            show=runLocally,
                                            savefig=True,
                                            axis=(0, 1.1, 0))
     #print("VAFs for positions called 01 in diploid but more in tetraploid,", patient, "sample", sample, ":")
     hist = lsl.createPrintAndSaveHistogram(onevtwo_VAFs,
                                            VAF1v2dir + patient + "_" +
Ejemplo n.º 18
0
                mismatchkeys.append(key)
            else:
                matchkeys.append(key)
        for key in matchkeys:
            outfile.write("\t" + str(key))
        for key in mismatchkeys:
            outfile.write("\t" + str(key))
        outfile.write("\n")
        for (patient, sample) in allcomparisons:
            comparison = allcomparisons[(patient, sample)]
            outfile.write(patient + "\t" + sample)
            for key in matchkeys:
                if key in comparison:
                    outfile.write("\t" + str(comparison[key]))
                else:
                    outfile.write("\t0")
            for key in mismatchkeys:
                if key in comparison:
                    outfile.write("\t" + str(comparison[key]))
                else:
                    outfile.write("\t0")
            outfile.write("\n")
        outfile.close()

#        lengthvec = [20, 50, 100, 1000, 10000, 100000, 1000000]
#        seg1binnedlengths = binLengths(lengthvec, seg1lengths)
#        seg2binnedlengths = binLengths(lengthvec, seg2lengths)
        lsl.createPrintAndSaveHistogram(numpy.log10(seg1lengths), comparison_dir + file1 + "_seglengths.txt", 0.01, xdata="Segment lengths", axis=(), show=True)
        lsl.createPrintAndSaveHistogram(numpy.log10(seg2lengths), comparison_dir + file2 + "_seglengths.txt", 0.01, xdata="Segment lengths", axis=(), show=True)

Ejemplo n.º 19
0
def processEvidence(evidence, balanced_evidence, osegs, allsamples):
    """Classify sample-pair match evidence per segment as good, bad or missed.

    ``evidence`` and ``balanced_evidence`` are indexed as
    ``[chr][isegrange][segpair] -> (match, antimatch)``.  For each pair the
    match fraction is ``match / (match + antimatch)``; when the module-level
    ``mirror_percentages`` is set and ``antimatch > match``, the antimatch
    fraction is used instead.

    * Unbalanced evidence (pairs with at least 10 observations): a fraction
      strictly between 0.05 and 0.95 is tallied as "bad", anything else as
      "good".  The original test ``perc<0.95 or perc<0.05`` reduced to
      ``perc<0.95`` and so counted strong anti-matches (<= 0.05) as bad,
      contradicting the symmetric test used for balanced evidence.
    * Balanced evidence (pairs with at least 20 observations): a fraction
      ``>= 0.95`` or ``<= 0.05`` is tallied as "missed".

    Every tally is made for both samples of the pair and for ``"overall"``.

    NOTE(review): ``osegs`` is never read; the function consults the
    module-level ``isegs`` and ``subdir`` instead -- confirm this is
    intended.  It also reads ``onepatientsamples``, ``showgraphs`` and
    ``twopatientcompare`` from module scope.

    Parameters:
        evidence: match counts for unbalanced segments.
        balanced_evidence: match counts for balanced segments; must have an
            entry for every ``[chr][isegrange]`` present in ``evidence``.
        osegs: unused.
        allsamples: sample names to create tallies for.

    Returns:
        ``(good_samples, bad_samples, missed_samples, good_sca, bad_sca,
        missed_sca)``.  The ``*_samples`` dicts map sample (or
        ``"overall"``) to a count; the ``*_sca`` dicts map it to a set of
        ``(chr, start, end)`` tuples.
    """
    def new_tallies():
        # One counter and one region set per sample, plus "overall".
        counts = {sample: 0 for sample in allsamples}
        counts["overall"] = 0
        regions = {sample: set() for sample in allsamples}
        regions["overall"] = set()
        return counts, regions

    def record(counts, regions, segpair, region):
        # Credit the region to both samples of the pair and to "overall".
        for key in (segpair[0], segpair[1], "overall"):
            counts[key] += 1
            regions[key].add(region)

    def match_fraction(match, antimatch):
        tot = match + antimatch
        if mirror_percentages and antimatch > match:
            return antimatch / tot
        return match / tot

    def same_group(segpair):
        # True when both samples are in onepatientsamples or neither is.
        return ((segpair[0] in onepatientsamples)
                == (segpair[1] in onepatientsamples))

    good_samples, good_sca = new_tallies()
    bad_samples, bad_sca = new_tallies()
    missed_samples, missed_sca = new_tallies()

    # Collected only for the optional histograms below.
    balanced_percs = []
    unbalanced_percs = []
    crosscheck_percs = []
    allbal_percs = []
    balpercs_crosspatient = []
    balpercs_inpatient = []
    unbalpercs_crosspatient = []
    unbalpercs_inpatient = []

    for chrom in evidence:
        for isegrange in evidence[chrom]:
            chrsegrange = (chrom, isegrange[0], isegrange[1])

            unbalanced_pairs = evidence[chrom][isegrange]
            for segpair, (match, antimatch) in unbalanced_pairs.items():
                if match + antimatch < 10:
                    continue
                perc = match_fraction(match, antimatch)
                unbalanced_percs.append(perc)
                if same_group(segpair):
                    unbalpercs_inpatient.append(perc)
                else:
                    unbalpercs_crosspatient.append(perc)
                if 0.05 < perc < 0.95:
                    record(bad_samples, bad_sca, segpair, chrsegrange)
                else:
                    record(good_samples, good_sca, segpair, chrsegrange)

            balanced_pairs = balanced_evidence[chrom][isegrange]
            for segpair, (match, antimatch) in balanced_pairs.items():
                if match + antimatch < 20:
                    continue
                perc = match_fraction(match, antimatch)
                balanced_percs.append(perc)
                if same_group(segpair):
                    balpercs_inpatient.append(perc)
                else:
                    balpercs_crosspatient.append(perc)
                # Samples listed under `subdir` for the segment starting at
                # this range's start; the last matching segment wins.
                unbal_samples = []
                for iseg in isegs[chrom]:
                    if iseg[0] == isegrange[0] and subdir in iseg[2]:
                        unbal_samples = iseg[2][subdir]
                if (segpair[0] in unbal_samples
                        or segpair[1] in unbal_samples):
                    crosscheck_percs.append(perc)
                else:
                    allbal_percs.append(perc)
                if perc >= 0.95 or perc <= 0.05:
                    record(missed_samples, missed_sca, segpair, chrsegrange)

    if showgraphs:
        print("All percent matches for all balanced-to-anything segments:")
        lsl.createPrintAndSaveHistogram(balanced_percs, "", .001)
        if twopatientcompare:
            print("Only in-patient percent matches for all balanced-to-anything segments:")
            lsl.createPrintAndSaveHistogram(balpercs_inpatient, "", .001)
            print("Only cross-patient percent matches for all balanced-to-anything segments:")
            lsl.createPrintAndSaveHistogram(balpercs_crosspatient, "", .001)

        print("Percent matches for all balanced-to-unbalanced checks:")
        lsl.createPrintAndSaveHistogram(crosscheck_percs, "", .001)
        print("Percent matches for all balanced-to-balanced checks:")
        lsl.createPrintAndSaveHistogram(allbal_percs, "", .001)

        print("All percent matches for unbalanced-to-unbalanced segments:")
        lsl.createPrintAndSaveHistogram(unbalanced_percs, "", .001)
        if twopatientcompare:
            print("Only in-patient percent matches for unbalanced-to-unbalanced segments:")
            lsl.createPrintAndSaveHistogram(unbalpercs_inpatient, "", .001)
            print("Only cross-patient percent matches for unbalanced-to-unbalanced segments:")
            lsl.createPrintAndSaveHistogram(unbalpercs_crosspatient, "", .001)

    return (good_samples, bad_samples, missed_samples, good_sca, bad_sca, missed_sca)
        scoreAnalysis(isegs, Xiaohong_segments[patient], sample, "Xiaohong")

    #writeSummary(isegs, patient, all_samples, all_analyses)
    #writeBalanceFiles(isegs, patient, all_samples)

# Pool the ratios of sample pairs such that no sample contributes twice:
# a pair is used only if neither of its samples has been used already.
unique_ratios = []
used_samples = []
for segpair in allratios:
    if segpair[0] in used_samples or segpair[1] in used_samples:
        continue
    used_samples.append(segpair[0])
    used_samples.append(segpair[1])
    unique_ratios.extend(allratios[segpair])
lsl.createPrintAndSaveHistogram(unique_ratios,
                                ratiodir + "unique_ratios_why95_01", .01)

# Dump each pair's ratios, one per line.  The context manager closes every
# file; the original left all of them open and relied on garbage collection
# to flush the data.
for segpair in allratios:
    with open(ratiodir + segpair[0] + "_" + segpair[1], "w") as outfile:
        for entry in allratios[segpair]:
            outfile.write(str(entry) + "\n")

# Pool the ratios of pairs whose first sample name contains "360" or "672_".
oldratios = []
for segpair in allratios:
    if "360" in segpair[0] or "672_" in segpair[0]:
        oldratios.extend(allratios[segpair])
lsl.createPrintAndSaveHistogram(oldratios, ratiodir + "oldratios", .01)

v1Mratios = []
v25Mratios = []
cross_ratios = []
Ejemplo n.º 21
0
             continue
         if line.find("Mean") != -1:
             continue
         splitline = line.split()
         if len(splitline) < 2:
             continue
         if seglengths[(patient, sample)] < avgseg:
             lownoisevals.append(float(splitline[1]))
         else:
             highnoisevals.append(float(splitline[1]))
     statfile.close()
 if (len(highnoisevals) > 0):
     print directory, "High:", numpy.average(highnoisevals), numpy.std(highnoisevals), numpy.median(highnoisevals), numpy.max(highnoisevals), numpy.min(highnoisevals)
     print directory, "Low:", numpy.average(lownoisevals), numpy.std(lownoisevals), numpy.median(lownoisevals), numpy.max(lownoisevals), numpy.min(lownoisevals)
     
     lsl.createPrintAndSaveHistogram(highnoisevals, "highnoiseout.txt", 0.01, xdata="noise")
     lsl.createPrintAndSaveHistogram(lownoisevals, "lownoiseout.txt", 0.01, xdata="noise")
     
     
     
     
     
     
     
     
     
     
     
     
     
     
Ejemplo n.º 22
0
        datum = []
        for entry in line:
            entry = int(entry)
            datum.append(entry)
        data.append(datum)

    # classify by distances

    ############################################################
    # write reports

    results = summarize(data, cisdists, transdists, nesteddists)

# Histogram the three distance collections with identical binning and axis.
for values, title in (
        (cisdists, "Cis Distances"),
        (transdists, "Trans Distances"),
        (nesteddists, "Nested Distances"),
):
    lsl.createPrintAndSaveHistogram(values,
                                    title,
                                    0.5,
                                    xdata="distance",
                                    axis=(-20, 500, 0))

plt.ylim(0, 400)
plt.hist(cisdists, 100)