コード例 #1
0
def ConvertToBedViaR(cel_file, bpmapfile):
    '''function for t calling R to get Bed-like values'''

    # original script - minus redundant variables.
    # library(rMAT)
    # library(Biobase)
    #
    # expName    <- "2012-07-12_SIR2_IP";  # cell file name
    # bpmapFile  <- "Sc03b_MR_v04.bpmap";  # the mapping!
    # seqHeader  <- ReadBPMAPAllSeqHeader(bpmapFile);
    # arrayFile1 <- c("2012-07-12_SIR2_IP.CEL");
    # ScSet      <- BPMAPCelParser(bpmapFile, arrayFile1, verbose = FALSE, groupName = "Sc", seqName="chr");
    # data       <- list(chrNo = ScSet1@featureChromosome, probePos = ScSet1@featurePosition, MATScore = exprs(ScSet1));  # last parameter is the raw data.
    # write.table(data, file = paste(expName,"_exp1_AllData.txt",sep=''), append = FALSE, row.names = FALSE, sep = "\t");


    bedfile = StringUtils.rreplace(cel_file, 'CEL', 'BED', 2)
    bedfile += "like"

    print "input file: %s" % (cel_file)
    print "Will be writing out to %s" % (bedfile)
    print "importing rMat and Biobase libraries"

    importr('rMAT')
    importr('Biobase')

    # print "creating seqHeader"
    # robjects.r('seqHeader <- ReadBPMAPAllSeqHeader(\"' + bpmapfile + '\")')
    print "creating scSet"
    robjects.r('ScSet <- BPMAPCelParser(\"' +
         bpmapfile + '\", c(\"' +
         cel_file + '\"), verbose = FALSE, groupName = "Sc", seqName="chr")')
    print "creating data"
    robjects.r('data <- list(chrNo = ScSet@featureChromosome, probePos = ScSet@featurePosition, MATScore = exprs(ScSet))')    # last parameter is the raw data.
    print "writing table"
    robjects.r('write.table(data, file = \"' + bedfile + '\", append = FALSE, quote = FALSE, row.names = FALSE, sep = "\t")')
コード例 #2
0
    # Normalisation driver (fragment): read a baseline probe file, then for each
    # file found in args.BEDlikefiles compute its average probe intensity and
    # apply the baseline to it, writing a ".normalized.BEDlike" file per input.
    files = os.listdir(args.BEDlikefiles)

    starttime = time.time()
    num_probes = 2635714    # hard-coded probe count used to size the arrays below
    print "initializing array..."
    # Two rows of num_probes floats; ReadBaseline fills this in place.
    # NOTE(review): the meaning of the two rows is not visible here — confirm in ReadBaseline.
    probeset = [[0.0 for y in xrange(num_probes)] for x in xrange(0, 2)]
    print "reading baseline from %s" % (args.Baselinefile)
    print "probeset[%i][%i]" % (len(probeset), len(probeset[0]))
    ReadBaseline(args.Baselinefile, probeset)
    blaverage = FindAverageProbeIntensity(probeset)
    print "baseline average probe intensity is:", blaverage
    print "done."
    # op = StringUtils.rreplace(sys.argv[1], 'BED', 'NORMAL', 1)
    for i, f in enumerate(files):
        filetime = time.time()
        # Output name: the input name with '.BEDlike' swapped for '.normalized.BEDlike'.
        of = StringUtils.rreplace(f, '.BEDlike', '.normalized.BEDlike', 1)
        # A fresh array per input file, loaded with the same reader as the baseline.
        bedprobes = [[0.0 for y in xrange(num_probes)] for x in xrange(0, 2)]
        # Paths are built by plain concatenation, so args.BEDlikefiles and
        # args.output_path must already end with a path separator.
        print "determining average probe intensity from %s%s" % (args.BEDlikefiles, f)
        print "bedprobes[%i][%i]" % (len(bedprobes), len(bedprobes[0]))
        ReadBaseline("%s%s" % (args.BEDlikefiles, f), bedprobes)
        bedaverage = FindAverageProbeIntensity(bedprobes)
        print "average probe intensity is:", bedaverage
        # Ratio of this file's average intensity to the baseline's, floored at 1.
        scaling = float(bedaverage) / blaverage
        if scaling < 1:
            scaling = 1
        print "baseline needs to be scaled by a factor of", scaling
        # Files are numbered from 1 (i + 1) when handed to ApplyBaseline.
        ApplyBaseline(i + 1, "%s%s" % (args.BEDlikefiles, f), "%s%s" % (args.output_path, of), probeset, scaling)    # first file is first file, use zero for chr/pos
        print "File %i - %s processed in %f seconds" % (i + 1, f, time.time() - filetime)
    # ProduceStats(len(files) + 1, probeset)
    print 'Completed in %s seconds' % int((time.time() - starttime))
コード例 #3
0
def run(PARAM, metadata_file, tohide):
    '''reads metadata_file, and updates sample entries in database'''

    while not os.path.isfile(metadata_file):
        print "Unable to find file %s" % metadata_file
        sys.exit()

    print "Data being imported into database: ", PARAM.get('default_database')

    print "opening connection(s) to MongoDB..."
    mongo = Mongo_Connector.MongoConnector(PARAM.get("server"), PARAM.get("port"), PARAM.get('default_database'))

    print "processing %s..." % metadata_file

    f = open(metadata_file, 'r')
    firstrow = True
    collection_name = 'samples'
    missing = 0
    for line in f:
        sample_update = {}
        if not firstrow:
            a = line.split("\t")
            # # Find file name
            file_name = StringUtils.rreplace(a[0], ".CEL", ".normalized.waves", 1)
            print "Checking for file %s in database %s" % (file_name, PARAM.get('default_database'))
            # check if entry exists in database
            if mongo.find(collection_name, {"file_name":file_name}, {"_id":1}).count() > 0 :
                print "Entry exists; updating metadata for ", file_name
                # Update other parameters
                sample_update["exp_date"] = a[1]
                sample_update["strain_number"] = a[2]
                sample_update["strain_background"] = a[3]
                sample_update["mutations"] = a[4]
                sample_update["researcher"] = a[5]
                sample_update["type"] = a[6]
                sample_update["antibody"] = a[7]
                sample_update["catalog_number"] = a[8]
                sample_update["antibody_volume"] = a[9]
                sample_update["array_type"] = a[10]
                sample_update["array_lot_number"] = a[11]
                sample_update["protocol"] = a[12]
                sample_update["crosslinking_time"] = a[13]
                sample_update["pubmed_id"] = a[14]
                sample_update["sample_id"] = a[15]
                sample_update["comments"] = a[16]
                if a[16] == "\n":
                    sample_update["comments"] = ""
                if tohide:
                    sample_update['hide'] = True
                else:
                    sample_update['hide'] = False    # now that metadata is added, unhide sample

                # update entries
                mongo.update(collection_name, {"file_name":file_name}, {"$set": sample_update})
                print "\tFinished updating metadata for ", file_name
            else:
                print "NOT FOUND: ", file_name
                missing += 1
        else:
            firstrow = False
    print "Done all updates from metadata file ", metadata_file
    if missing > 0:
        print "%s files were unable to be updated as no matching file_name was found in the waves table." % missing
    mongo.close()
コード例 #4
0
def find_waves_in_spliced_genes(splicefile, wavesfile, output, autothresh, heisig, thresh):
    # body of function goes here

    chromosomes = {1:"I", 2:"II", 3:"III", 4:"IV", 5:"V", 6:"VI", 7:"VII", 8:"VIII", 9:"IX", 10:"X", 11:"XI", 12:"XII", 13:"XIII", 14:"XIV", 15:"XV", 16:"XVI"}

    waves = []

    print "Reading files..."
    f = open(wavesfile, 'r')
    for line in f:
        if line.startswith("#"):
            continue
        else:
            a = line.split("\t")
            wave = {}
            wave["chr"] = "chr" + a[0].replace("chr", "").upper()
            wave["pos"] = int(a[1])
            wave["stddev"] = int(a[2])
            wave["height"] = float(a[3])
            wave["used"] = False
            waves.append(wave)
    f.close()

    if(autothresh):
        if(heisig):
            # print "\nNow determining background levels for height of peaks"
            bins = 70    # based on max peak height of 7
            counts = [0] * bins
            thresh = [0] * bins
            for i in range(0, bins):
                thresh[i] = (i + 1) * 0.1
            for i in waves:
                # check all heights to determine background levels
                counts[int(i['height'] / 0.1)] += 1    # increment count where height1 is in bin of size 0.1
            # print "counts are: ", counts
            x = []
            y = []
            for i in range(10, 15):    # (0 to 9 correspond to heights 0 to 0.9, do not have)
                x.append(thresh[i])
                y.append(counts[i])
            # print "x is ", x
            # print "y is ", y
            slr = scipystats.linregress(x, y)
            slope = slr[0]
            intercept = slr[1]
            # print "slope: %s and intercept: %s for background peak height" % (slope, intercept)
            # find x-intercept, threshold for noise-signal
            xint = abs(intercept / slope)
            print "height threshold between noise and signal is ", xint
            thresh = round(xint, 2)
        else:
            # determine threshold for sigma
            bins = 300    # based on max sigma of 300
            counts = [0] * bins
            threshes = [0] * bins
            for i in range(0, bins):
                threshes[i] = (i + 1)
            for i in waves:
                # check all heights to determine background levels
                counts[int(i['stddev'])] += 1    # increment count where height1 is in bin of size 0.1
            # print "counts are: ", counts
            highest = 0
            ind = -1
            for i in range(0, bins):
                if counts[i] > highest:
                    highest = counts[i]
                    ind = i
            thresh = threshes[ind]
            print "sigma threshold between noise and signal is ", thresh

    # remove waves that don't meet threshold
    if(heisig):
        waves[:] = [x for x in waves if x['height'] > thresh]
    else:
        waves[:] = [x for x in waves if x['stddev'] > thresh]

    waves.sort(key = lambda x: (x['chr'], x['pos']))    # list sorted by position

    splice = []
    f = open(splicefile, 'r')
    for line in f:
        a = line.split("\t")
        region = {}
        region["gene"] = a[0]
        region["chr"] = "chr" + chromosomes[int(a[1])]
        if int(a[2]) == 1:    # forward
            # promoter
            region["p1"] = int(a[3]) - 301    # p1 is left boundary of promoter
            region["p2"] = int(a[3]) - 1    # p2 is right boundary of promoter
            # exon1
            region["e11"] = int(a[3])    # e11 is left boundary of exon1
            region["e12"] = int(a[4])    # e12 us right boundary of exon1
            # intron
            region["i1"] = int(a[5])
            region["i2"] = int(a[6])
            # exon2
            region["e21"] = int(a[7])
            region["e22"] = int(a[8])
            # 3' UTR
            region["u1"] = int(a[8]) + 1    # 3'UTR
            region["u2"] = int(a[8]) + 301
        else:    # reverse
            # promoter
            region["p1"] = int(a[4]) - 301
            region["p2"] = int(a[4]) - 1
            # exon1
            region["e11"] = int(a[4])
            region["e12"] = int(a[3])
            # intron
            region["i1"] = int(a[6])
            region["i2"] = int(a[5])
            # exon2
            region["e21"] = int(a[8])
            region["e22"] = int(a[7])
            # 3' UTR
            region["u1"] = int(a[7]) + 1
            region["u2"] = int(a[7]) + 301
        # counts in each region
        region['p'] = 0
        region['e1'] = 0
        region['i'] = 0
        region['e2'] = 0
        region['u'] = 0

        # splice site regions (+/- 100 bp)
        region['pe11'] = region['e11'] - 99    # promoter to exon1 left boundary
        region['pe12'] = region['e11'] + 101    # promoter to exon1 right boundary
        region['e1i1'] = region['i1'] - 99
        region['e1i2'] = region['i1'] + 101
        region['ie21'] = region['e21'] - 99
        region['ie22'] = region['e21'] + 101
        region['e2u1'] = region['u1'] - 99
        region['e2u2'] = region['u1'] + 101
        # splice site counts
        region['pe1'] = 0
        region['e1i'] = 0
        region['ie2'] = 0
        region['e2u'] = 0
        splice.append(region)

    splice.sort(key = lambda x: (x['chr'], x['p1']))



    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    print_thread = PrintThread.StringWriter(print_queue, output, StringUtils.rreplace(os.path.basename(wavesfile), ".normalized.waves", "_" + str(thresh) + "_splice_summary.txt", 1), True, True)
    # print header
    print_queue.put("gene\tchr\tp1\tp2\te11\te12\ti1\ti2\te21\te22\tu1\tu2\tp\tpe1\te1\te1i\ti\tie2\te2\te2u\tu")
    print "Now finding peaks in each region..."
    w = 0
    b = 0


    while b < len(splice) and w < len(waves):
        if(waves[w]['chr'] == splice[b]['chr'] and (waves[w]['pos'] + 1 * waves[w]['stddev']) >= splice[b]['p1'] and (waves[w]['pos'] - 1 * waves[w]['stddev']) <= splice[b]['u2']):
            # print "found a wave in gene ", splice["gene"]
            # find which segment of gene it is present in
            # only use standard deviation threshold when at beginning of promoter and end of UTR
            if (waves[w]['pos'] + 1 * waves[w]['stddev']) >= splice[b]['p1'] and waves[w]['pos'] <= splice[b]['p2']:
                splice[b]['p'] += 1.0 / (splice[b]['p2'] - splice[b]['p1'])
            elif waves[w]['pos'] >= splice[b]['e11'] and waves[w]['pos'] <= splice[b]['e12']:
                splice[b]['e1'] += 1.0 / (splice[b]['e12'] - splice[b]['e11'])
            elif waves[w]['pos'] >= splice[b]['i1'] and waves[w]['pos'] <= splice[b]['i2']:
                splice[b]['i'] += 1.0 / (splice[b]['i2'] - splice[b]['i1'])
            elif waves[w]['pos'] >= splice[b]['e21'] and waves[w]['pos'] <= splice[b]['e22']:
                splice[b]['e2'] += 1.0 / (splice[b]['e22'] - splice[b]['e21'])
            elif waves[w]['pos'] >= splice[b]['u1'] and (waves[w]['pos'] - 1 * waves[w]['stddev']) <= splice[b]['u2']:
                splice[b]['u'] += 1.0 / (splice[b]['u2'] - splice[b]['u1'])

            if waves[w]['pos'] >= splice[b]['pe11'] and waves[w]['pos'] <= splice[b]['pe12']:
                splice[b]['pe1'] += 1.0 / (splice[b]['pe12'] - splice[b]['pe11'])
            elif waves[w]['pos'] >= splice[b]['e1i1'] and waves[w]['pos'] <= splice[b]['e1i2']:
                splice[b]['e1i'] += 1.0 / (splice[b]['e1i2'] - splice[b]['e1i1'])
            elif waves[w]['pos'] >= splice[b]['ie21'] and waves[w]['pos'] <= splice[b]['ie22']:
                splice[b]['ie2'] += 1.0 / (splice[b]['ie22'] - splice[b]['ie21'])
            elif waves[w]['pos'] >= splice[b]['e2u1'] and waves[w]['pos'] <= splice[b]['e2u2']:
                splice[b]['e2u'] += 1.0 / (splice[b]['e2u2'] - splice[b]['e2u1'])
            waves[w]['used'] = True
            w += 1
        elif waves[w]['chr'] > splice[b]['chr'] or ((waves[w]['pos'] - 1 * waves[w]['stddev']) > splice[b]['u2'] and waves[w]['chr'] == splice[b]['chr']):
            # wave is past bin, move to next bin
            # write to file
            # print "b: %i, w: %i, wave is AFTER, start: %s, end: %s, pos: %s, %s %s" % (b, w, splice[b]['start'], splice[b]['end'], waves[w]['pos'], splice[b]['chr'], waves[w]['chr'])
            print_queue.put(splice[b]['gene'] + "\t" + splice[b]['chr'] + "\t" + \
                            str(splice[b]['p1']) + "\t" + str(splice[b]['p2']) + "\t" + \
                            str(splice[b]['e11']) + "\t" + str(splice[b]['e12']) + "\t" + \
                            str(splice[b]['i1']) + "\t" + str(splice[b]['i2']) + "\t" + \
                            str(splice[b]['e21']) + "\t" + str(splice[b]['e22']) + "\t" + \
                            str(splice[b]['u1']) + "\t" + str(splice[b]['u2']) + "\t" + \
                            str(splice[b]['p']) + "\t" + str(splice[b]['pe1']) + "\t" + \
                            str(splice[b]['e1']) + "\t" + str(splice[b]['e1i']) + "\t" + \
                            str(splice[b]['i']) + "\t" + str(splice[b]['ie2']) + "\t" + \
                            str(splice[b]['e2']) + "\t" + str(splice[b]['e2u']) + "\t" + \
                            str(splice[b]['u']))
            b += 1
        else:
            # wave is before bin
            # print "b: %i, w: %i, wave is BEFORE, start: %s, end: %s, pos: %s, %s %s" % (b, w, splice[b]['start'], splice[b]['end'], waves[w]['pos'], splice[b]['chr'], waves[w]['chr'])
            w += 1

    while b < len(splice):
        # if ended because got to end of waves
        # need to write statistics for remaining bins
        # print "filling in remaining bins with 0s"
        print_queue.put(splice[b]['gene'] + "\t" + splice[b]['chr'] + "\t" + \
                            str(splice[b]['p1']) + "\t" + str(splice[b]['p2']) + "\t" + \
                            str(splice[b]['e11']) + "\t" + str(splice[b]['e12']) + "\t" + \
                            str(splice[b]['i1']) + "\t" + str(splice[b]['i2']) + "\t" + \
                            str(splice[b]['e21']) + "\t" + str(splice[b]['e22']) + "\t" + \
                            str(splice[b]['u1']) + "\t" + str(splice[b]['u2']) + "\t" + \
                            str(splice[b]['p']) + "\t" + str(splice[b]['pe1']) + "\t" + \
                            str(splice[b]['e1']) + "\t" + str(splice[b]['e1i']) + "\t" + \
                            str(splice[b]['i']) + "\t" + str(splice[b]['ie2']) + "\t" + \
                            str(splice[b]['e2']) + "\t" + str(splice[b]['e2u']) + "\t" + \
                            str(splice[b]['u']))
        b += 1

    # end printing
    if print_thread is None or not print_thread.is_alive():
        pass
    else:
        while print_queue.qsize() > 0:
            print "waiting on print_queue to empty", print_queue.qsize()
            time.sleep(1)
        print_thread.END_PROCESSES = True
        print_thread.f.close()
        # while not print_thread.is_closed():
        #    print "waiting for print_thread to close ", print_queue.qsize(), " ", print_thread.is_closed(), " ", print_thread.END_PROCESSES
        #    time.sleep(1)

    # quick summary statistics
    unassigned = 0
    for i in waves:
        if not i["used"]:
            unassigned += 1


    if(heisig):
        print "Height threshold applied to waves:", thresh
    else:
        print "Sigma threshold applied to waves:", thresh

    ln = 0    # total number of genes
    ctp = 0    # count of peaks in promoters
    ctpe1 = 0    # count of peaks in promoter/exon1 region
    cte1 = 0
    cte1i = 0
    cti = 0
    ctie2 = 0
    cte2 = 0
    cte2u = 0
    ctu = 0
    emptygenes = 0

    for line in splice:
        ln += 1
        ctp += line['p']
        ctpe1 += line['pe1']
        cte1 += line['e1']
        cte1i += line['e1i']
        cti += line['i']
        ctie2 += line['ie2']
        cte2 += line['e2']
        cte2u += line['e2u']
        ctu += line['u']
        if line['p'] == 0 and line['e1'] == 0 and line['i'] == 0 and line['e2'] == 0 and line['u'] == 0:
            # no waves in this gene
            emptygenes += 1
            ln -= 1    # dont use in calculating the average

    avgp = float(ctp) / ln
    avgpe1 = float(ctpe1) / ln
    avge1 = float(cte1) / ln
    avge1i = float(cte1i) / ln
    avgi = float(cti) / ln
    avgie2 = float(ctie2) / ln
    avge2 = float(cte2) / ln
    avge2u = float(cte2u) / ln
    avgu = float(ctu) / ln
    print "%s waves were not part of a spliced gene" % unassigned
    print "%s genes had no waves" % emptygenes
    print "Region\tAverage_num_waves"
    print "Prom\t%f" % avgp
    print "Exon_1\t%f" % avge1
    print "Intron\t%f" % avgi
    print "Exon_2\t%f" % avge2
    print "3'_UTR\t%f" % avgu
    print "\n"
    print "prom_exon1\t%f" % avgpe1
    print "exon1_intron\t%f" % avge1i
    print "intron_exon2\t%f" % avgie2
    print "exon2_utr\t%f" % avge2u
コード例 #5
0
def FindBaseline(file_name, normalize = False):
    '''Find the baseline - TODO: break this into smaller functions'''
    f = open(file_name, 'r')    # open file

    print "processing data file (" + file_name + ")..."
    first_line = True
    headers = []
    data = []
    for line in f:
        if (first_line):
            headers = line.split("\t")
            for h in range(len(headers)):
                headers[h] = headers[h].lower()
                if headers[h].find("\n") != -1:
                    headers[h] = headers[h].replace("\n", "")
            first_line = False
        else:
            a = line.split("\t")
            r = row(a[0], int(a[1]), float(a[2]))
            data.append(r)
    # find most common number

    f.close()

    if normalize:
        v_sum = 0
        hist = {}
        for x in range(len(data)):
            v_sum += data[x].value
            if hist.has_key(str(data[x].value)):
                hist[str(data[x].value)] += 1
            else:
                hist[str(data[x].value)] = 1
        v_avg = v_sum / len(data)
        print "v average = %f" % (v_avg)

        point = 0
        largest = 0
        for g, y in hist.iteritems():
            if y > largest:
                largest = y
                point = g
                # print "g, y (%s, %i)" % (g, y)
        # print "point, largest (%s, %i)" % (point, largest)
        point = float(point)
        for g in range(len(data)):
            v = data[g].value - point
            if v < 0:
                v = 0
            data[g].setv(v)
    # create wig file
    f_w_name = StringUtils.rreplace(file_name, '.BEDlike', '', 1)
    # f_w_name = StringUtils.rreplace(f_w_name, 'BED', 'WIG', 1)    # #should these two lines just replace .BEDlike with .WIG?
    trackname = os.path.basename(f_w_name)

    # print "Writing to %s" % (f_w_name)
    # f = open(f_w_name, 'w')    # open file


    current_chr = data[0].chromosome
    print "New Chromosome %s (%s)" % (current_chr, chr_yeast[current_chr])
    last_bp = 0
    last_ht = 0
    wigfile = WigFileThread.WigFileWriter(None)
    wigfile.start_wig_writer(os.path.dirname(f_w_name), os.path.basename(f_w_name), trackname)


    # for x in range (0, 5020):
    #    print "x=%i position=%i, value=%f" % (x, data[x].position + 1, data[x].value)


    x = 0
    a = 0
    # gc.disable()
    coverage_map = []
    l = len(data)
    x = 1
    block_left = 0
    while x < l:
        # print "x of len: %i/%i" % (x, l)
        if data[x].chromosome != current_chr:
            if len(coverage_map) > 0:
                wigfile.add_map(coverage_map, chr_yeast[current_chr], block_left)    # for yeast chromosome nomenclature (roman numeral)
                # wigfile.add_map(coverage_map, current_chr, block_left)
                coverage_map = []
            # TODO: taper off chromosome
            # TODO: taper "on" new chromosome
            print "New Chromosome %s (%s)" % (data[x].chromosome, chr_yeast[data[x].chromosome])
            current_chr = data[x].chromosome    # switch chromosomes,
        if data[x - 1].chromosome == data[x].chromosome:
            block_left = data[x - 1].position + 2    # shift by 1, and block starts after the zero.
            last_bp = data[x - 1].position + 1
        else:
            block_left = data[x].position + 2
            last_bp = data[x].position + 1

        # print "x=%i data=%i, value=%f" % (x, data[x].position + 1, data[x].value)
        while x < l and data[x].value >= 0 and data[x].chromosome == current_chr:
            diff = (data[x].position + 1) - last_bp
            # if diff > 6:
            #    diff = 6
            if (diff > 1):
                slope = float(data[x].value - last_ht) / (diff)    # slope between the two
                for y in range(1, diff):
                    coverage_map.append(round(last_ht + (slope * y), 2))
                coverage_map.append(round(data[x].value, 2))
            else:
                coverage_map.append(round(data[x].value, 2))
                a += 1
            last_bp = data[x].position + 1
            last_ht = data[x].value
            if x < l - 1 and (data[x + 1].value <= 0 or data[x + 1].chromosome != current_chr):
                diff = (data[x + 1].position + 1) - (data[x].position + 1)
                slope = float(0 - data[x].value) / (diff)
                for y in range(1, diff):
                    coverage_map.append(round(data[x].value + (slope * y), 2))
                last_ht = 0
            x += 1

        while x < l and data[x].value <= 0:
            x += 1
            last_ht = 0
        if len(coverage_map) > 0:
            wigfile.add_map(coverage_map, chr_yeast[current_chr], block_left)    # for yeast chromosome nomenclature (roman numeral)
            # print "writing map"
            # wigfile.add_map(coverage_map, current_chr, block_left)
            coverage_map = []
    # gc.enable()


    print "Closing Wigwriter.  This may take some time."
    wigfile.close_wig_writer()
    print "Wigwriter closed."
コード例 #6
0
def find_waves_in_promoter(orffile, wavesfile, output, autothresh, heisig):
    # body of function goes here

    chromosomes = {1:"I", 2:"II", 3:"III", 4:"IV", 5:"V", 6:"VI", 7:"VII", 8:"VIII", 9:"IX", 10:"X", 11:"XI", 12:"XII", 13:"XIII", 14:"XIV", 15:"XV", 16:"XVI"}

    waves = []

    print "Reading files..."
    f = open(wavesfile, 'r')
    for line in f:
        if line.startswith("#"):
            continue
        else:
            a = line.split("\t")
            wave = {}
            wave["chr"] = "chr" + a[0].replace("chr", "").upper()
            wave["pos"] = int(a[1])
            wave["stddev"] = int(a[2])
            wave["height"] = float(a[3])
            wave["used"] = False
            waves.append(wave)
    f.close()

    # automatically determine noise
    if(autothresh):
        if(heisig):
            # print "\nNow determining background levels for height of peaks"
            bins = 70    # based on max peak height of 7
            counts = [0] * bins
            thresh = [0] * bins
            for i in range(0, bins):
                thresh[i] = (i + 1) * 0.1
            for i in waves:
                # check all heights to determine background levels
                counts[int(i['height'] / 0.1)] += 1    # increment count where height1 is in bin of size 0.1
            # print "counts are: ", counts
            x = []
            y = []
            for i in range(10, 15):    # (0 to 9 correspond to heights 0 to 0.9, do not have)
                x.append(thresh[i])
                y.append(counts[i])
            # print "x is ", x
            # print "y is ", y
            slr = scipystats.linregress(x, y)
            slope = slr[0]
            intercept = slr[1]
            # print "slope: %s and intercept: %s for background peak height" % (slope, intercept)
            # find x-intercept, threshold for noise-signal
            xint = abs(intercept / slope)
            print "height threshold between noise and signal is ", xint
            thresh = round(xint, 2)
        else:
            # determine threshold for sigma
            bins = 300    # based on max sigma of 300
            counts = [0] * bins
            threshes = [0] * bins
            for i in range(0, bins):
                threshes[i] = (i + 1)
            for i in waves:
                # check all heights to determine background levels
                counts[int(i['stddev'])] += 1    # increment count where height1 is in bin of size 0.1
            # print "counts are: ", counts
            highest = 0
            ind = -1
            for i in range(0, bins):
                if counts[i] > highest:
                    highest = counts[i]
                    ind = i
            thresh = threshes[ind]
            print "sigma threshold between noise and signal is ", thresh
    else:
        if(heisig):
            usr_in = raw_input("What would you like to use as the minimum wave height? ")
            thresh = float(usr_in)
        else:
            usr_in = raw_input("What would you like to use as the minimum wave sigma? ")
            thresh = int(usr_in)
    # remove waves that don't meet threshold
    if(heisig):
        waves[:] = [x for x in waves if x['height'] > thresh]
    else:
        waves[:] = [x for x in waves if x['stddev'] > thresh]

    waves.sort(key = lambda x: (x['chr'], x['pos']))    # list sorted by position

    usr_in = raw_input("Distance upstream of TSS to check for waves: ")
    prom = int(usr_in)

    bed = []
    f = open(orffile, 'r')
    next(f)
    for line in f:
        a = line.split("\t")
        if int(a[1]) == 17:    # assumed mitochondrial chromosome
            pass
        else:
            region = {}
            region['gene'] = a[0]
            # print a[1]
            region["chr"] = "chr" + chromosomes[int(a[1])]
            # print "chromosome is: ", region["chr"]
            region["start"] = int(a[2])
#             if int(a[4]) == 1:
#                 region["start"] = int(a[2]) - prom
#                 region["end"] = int(a[2])
#             else:
#                 region["start"] = int(a[3])
#                 region["end"] = int(a[3]) + prom
            if int(a[4]) == 1:
                region["start"] = int(a[2])
                region["end"] = int(a[2]) + prom
            else:
                region["start"] = int(a[3]) - prom
                region["end"] = int(a[3])
            bed.append(region)

    bed.sort(key = lambda x: (x['chr'], x['start']))

    maxperbin = 0

    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    # print "printing to: ", output + StringUtils.rreplace(os.path.basename(wavesfile), ".normalized.waves", "_" + str(thresh) + "_summary.txt", 1)
    print_thread = PrintThread.StringWriter(print_queue, output, StringUtils.rreplace(os.path.basename(wavesfile), ".waves", "_" + str(thresh) + "_promoter_summary.txt", 1), True, True)

    print "Now finding peaks in each promoter..."
    w = 0
    b = 0
    count = 0
    height = 0

    while b < len(bed) and w < len(waves):
        if(waves[w]['chr'] == bed[b]['chr'] and (waves[w]['pos'] + 1 * waves[w]['stddev']) >= bed[b]['start'] and (waves[w]['pos'] - 1 * waves[w]['stddev']) <= bed[b]['end']):
            # print "found a wave in bin ", b
            count += 1
            if count > maxperbin:
                maxperbin = count
            height += waves[w]['height']
            waves[w]['used'] = True
            w += 1
        elif waves[w]['chr'] > bed[b]['chr'] or ((waves[w]['pos'] + 1 * waves[w]['stddev']) > bed[b]['end'] and waves[w]['chr'] == bed[b]['chr']):
            # wave is past bin, move to next bin
            # write to file
            # print "b: %i, w: %i, wave is AFTER, start: %s, end: %s, pos: %s, %s %s" % (b, w, bed[b]['start'], bed[b]['end'], waves[w]['pos'], bed[b]['chr'], waves[w]['chr'])
            print_queue.put(bed[b]['gene'] + "\t" + bed[b]['chr'] + "\t" + str(bed[b]['start']) + "\t" + str(bed[b]['end']) + "\t" + str(count) + "\t" + str(height))
            b += 1
            count = 0
            height = 0
        else:
            # wave is before bin
            # print "b: %i, w: %i, wave is BEFORE, start: %s, end: %s, pos: %s, %s %s" % (b, w, bed[b]['start'], bed[b]['end'], waves[w]['pos'], bed[b]['chr'], waves[w]['chr'])
            w += 1

    while b < len(bed):
        # if ended because got to end of waves
        # need to write statistics for remaining bins
        # print "filling in remaining bins with 0s"
        start = bed[b]["start"]
        end = bed[b]["end"]
        chrom = bed[b]["chr"]

        count = 0
        height = 0

        print_queue.put(bed[b]['gene'] + "\t" + str(chrom) + "\t" + str(start) + "\t" + str(end) + "\t" + str(count) + "\t" + str(height))
        b += 1

    # end printing
    if print_thread is None or not print_thread.is_alive():
        pass
    else:
        while print_queue.qsize() > 0:
            print "waiting on print_queue to empty", print_queue.qsize()
            time.sleep(1)
        print_thread.END_PROCESSES = True
        print_thread.f.close()
        # while not print_thread.is_closed():
        #    print "waiting for print_thread to close ", print_queue.qsize(), " ", print_thread.is_closed(), " ", print_thread.END_PROCESSES
        #    time.sleep(1)


    print_queue = multiprocessing.Queue()
    # launch thread to read and process the print queue
    print_thread = PrintThread.StringWriter(print_queue, output, StringUtils.rreplace(os.path.basename(wavesfile), ".waves", "_" + str(thresh) + "_promoter_counts.txt", 1), True, True)


    # quick summary statistics
    print_queue.put("nwaves\tnbins\tavgBinSize")
    unassigned = 0
    un_height = 0
    for i in waves:
        if not i["used"]:
            unassigned += 1
            un_height += i["height"]
            # print i['height']

    # for each bin of height 0.5, determine % of waves not mapped
    total = [0] * 13
    unmap = [0] * 13
    for i in waves:
        total[int(i['height'] / 0.5) - 2] += 1    # TODO: this is almost certainly wrong!
        if not i['used']:
            unmap[int(i['height'] / 0.5) - 2] += 1    # TODO: most certainly wrong.
    # TODO: Print these values in some meaningful way.
    prop = [0.0] * 13
    for i in range(0, len(prop)):
        if total[i] != 0:
            prop[i] = float(unmap[i]) / float(total[i])
        else:
            prop[i] = 0
    print "Proportions of unused waves in each bin:"
    print "[1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5, 6.0, 6.5, 7.0]"
    print total
    print unmap
    print prop

    # print "%s waves were unassigned to a BED bin." % unassigned

    print "Max per bin is:", maxperbin
    print "Height threshold applied to waves:", thresh

    counts = [0] * (maxperbin + 1)
    # sizes = [[]] * (maxperbin + 1)
    sizes = [[] for x in range(0, maxperbin + 1)]
    f = open(output + StringUtils.rreplace(os.path.basename(wavesfile), ".waves", "_" + str(thresh) + "_promoter_summary.txt", 1), 'r', 0)

    ln = 0
    for line in f:
        ln += 1
        a = line.split("\t")
        # print "on line ", ln, " \n line is: ", line, " \nand value is: ", a[4]
        counts[int(a[4])] += 1
        sizes[int(a[4])].append(int(a[3]) - int(a[2]))
    f.close()
    print "%s waves of %s were unassigned to a BED bin (%f%%)." % (unassigned, len(waves), round(float(unassigned) / len(waves) * 100, 2))
    print "Average height of waves not part of a bin:", (un_height / unassigned)
    print_queue.put(str(unassigned) + "\t0\t0")
    print "Total number of bins:", len(bed)
    for i in range(0, maxperbin + 1):
        tot = 0
        # print len(sizes[i])
        for j in range(0, len(sizes[i])):
            tot += sizes[i][j]
            # print "size is: ", sizes[i][j]
        if len(sizes[i]) != 0:
            avg = float(tot) / len(sizes[i])
        else:
            avg = 0

        if i != 0:
            print_queue.put(str(i) + "\t" + str(counts[i]) + "\t" + str(avg))
        else:
            print "%i bins have no waves mapped to them (%s%%)" % (counts[i], float(counts[i]) / len(bed) * 100)
            print "%i bins have at least 1 wave mapped to them." % (len(bed) - counts[i])
            print_queue.put(str(i) + "\t" + str(counts[i]) + "\t" + str(avg))

    # end printing
    if print_thread is None or not print_thread.is_alive():
        pass
    else:
        while print_queue.qsize() > 0:
            print "waiting on print_queue to empty", print_queue.qsize()
            time.sleep(1)
        print_thread.END_PROCESSES = True
        print_thread.f.close()
コード例 #7
0
if __name__ == "__main__":
    # Script entry point: parse the command line, build the parameter
    # object from the parameter file, apply command-line overrides,
    # echo the effective settings and hand the object to main().
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "parameter_file",
        help=".input parameterfile",
        type=str)
    parser.add_argument(
        "-data_file",
        help="override the source file in the parameter input file, containing data upon which waves will be called",
        type=str)
    parser.add_argument(
        "-output_path",
        help="override the path for output files",
        type=str)
    parser.add_argument(
        "-noise_compensation", "-nc",
        help="increase the amount of noise acceptable for wave calling - set to 16 for chip-chip, 1 for chip_seq.",
        type=int,
        default=1)
    args = parser.parse_args()

    # NOTE(review): `p` is not read again inside this block; it is kept
    # because constructing it, or the module-level name itself, may be
    # relied on elsewhere -- confirm before removing.
    p = Parameters.parameter()

    param = create_param_obj(args.parameter_file)
    # Always taken from the command line (argparse default is 1).
    param.set("noise_compensation", args.noise_compensation)

    # Values given on the command line win over the parameter file.
    if args.data_file:
        param.set("input_file", args.data_file)
        # Derive the output file_name from the data file's base name with
        # '.wig' removed via StringUtils.rreplace (".waves" gets added later).
        data_basename = os.path.basename(args.data_file)
        ofile = StringUtils.rreplace(data_basename, '.wig', '', 1)
        param.set("file_name", ofile)
    if args.output_path:
        param.set("output_path", args.output_path)

    # Echo the effective settings. The double space after each colon
    # reproduces what a two-item print statement emits.
    print("param file:  %s" % (args.parameter_file,))
    print("input_file:  %s" % (param.get("input_file"),))
    print("output_path:  %s" % (param.get("output_path"),))
    print("output_file_name:  %s" % (param.get("file_name"),))

    main(param)