def create_new_header(infile, mappings, outfile):
    """Create new header in BigWig, with UCSC chromosome names.

    Args:
        infile: Path of the bigWig file to read.
        mappings: Dict mapping chromosome names used in ``infile`` to the
            names that should be written to ``outfile``.
        outfile: Path of the bigWig file to produce.

    When every chromosome name of ``infile`` already occurs among the values
    of ``mappings``, or when no chromosome has a mapping at all, ``infile`` is
    simply renamed to ``outfile`` and the process exits with status 0.
    Otherwise ``outfile`` is written with the mapped names; chromosomes
    without a mapping are reported on stdout and left out.
    """
    with pyBigWig.open(infile) as bw:
        # Read the chromosome table once; it is needed for every step below.
        chroms = bw.chroms()

        if set(chroms).issubset(mappings.values()):
            # If chromosome names are already UCSC, just rename input file to output name.
            # Exit with status 0 since this is normal behavior.
            os.rename(infile, outfile)
            sys.exit(0)

        hdr = [(mappings[chrom], length) for chrom, length in chroms.items() if chrom in mappings]

        if not hdr:
            msg = "Neither of the chromosomes in the input file has a valid UCSC pair. No mapping will be done."
            print(warning(msg))
            os.rename(infile, outfile)
            sys.exit(0)

        seq_num = 0
        with pyBigWig.open(outfile, 'w') as bw_output:
            bw_output.addHeader(hdr)
            for chrom, length in chroms.items():
                if chrom not in mappings:
                    # Unmapped sequences are dropped, so do not bother
                    # loading their intervals.
                    seq_num += 1
                    print('UCSC chromosome/conting mapping for {} is missing'.format(chrom))
                    continue

                ints = bw.intervals(chrom, 0, length)
                if ints:
                    bw_output.addEntries([mappings[chrom]] * len(ints),
                                         [x[0] for x in ints],
                                         ends=[x[1] for x in ints],
                                         values=[x[2] for x in ints])

        if seq_num > 0:
            print(warning("UCSC chromosome/conting mapping for {} sequence(s) is missing. "
                          "This sequence(s) will not be included in the bigWig file.".format(seq_num)))
def getChromSizes(bigwigFilesList):
    """
    Get chromosome sizes from bigWig file with pyBigWig

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Chromosome name(s) and size(s).
    >>> assert(getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([])))

    Returns a tuple ``(common, non_common)``: ``common`` is the sorted list of
    ``(name, size)`` pairs shared by all files and ``non_common`` is the set of
    pairs that did not match between files.  If a file shares no pair with the
    others, its names are retried with the 'chr' prefix added/removed; if that
    also fails, a report is written to stderr and the process exits with
    status 1.
    """

    def print_chr_names_and_size(chr_set):
        sys.stderr.write("chromosome\tlength\n")
        for name, size in chr_set:
            sys.stderr.write("{0:>15}\t{1:>10}\n".format(name, size))

    def read_names_and_sizes(fname):
        # Always release the file handle, even if reading the header fails.
        fh = pyBigWig.open(fname)
        try:
            return set(fh.chroms().items())
        finally:
            fh.close()

    bigwigFilesList = bigwigFilesList[:]

    # Read every header exactly once and reuse it for both passes below.
    per_file = [(fname, read_names_and_sizes(fname)) for fname in bigwigFilesList]

    # Start from the union of all (name, size) pairs; it is narrowed down to
    # the intersection file by file in the loop below.
    common_chr = set()
    for _fname, names_and_size in per_file:
        common_chr = common_chr.union(names_and_size)

    non_common_chr = set()
    for bw, _names_and_size in per_file:
        if len(common_chr & _names_and_size) == 0:
            # try to add or remove 'chr' from the chromosome name
            _corr_names_size = set()
            for chrom_name, size in _names_and_size:
                if chrom_name.startswith('chr'):
                    _corr_names_size.add((chrom_name[3:], size))
                else:
                    _corr_names_size.add(('chr' + chrom_name, size))
            if len(common_chr & _corr_names_size) == 0:
                message = "No common chromosomes found. Are the bigwig files " \
                          "from the same species and same assemblies?\n"
                sys.stderr.write(message)
                print_chr_names_and_size(common_chr)

                sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n"
                                 "lengths from file\n{}\n".format(bw))
                print_chr_names_and_size(_names_and_size)
                sys.exit(1)
            else:
                _names_and_size = _corr_names_size

        non_common_chr |= common_chr ^ _names_and_size
        common_chr = common_chr & _names_and_size

    if len(non_common_chr) > 0:
        sys.stderr.write("\nThe following chromosome names did not match between the the bigwig files\n")
        print_chr_names_and_size(non_common_chr)

    # get the list of common chromosome names and sizes
    return sorted(common_chr), non_common_chr
 def _generate_chunk_output_file(self, i=None):
     """Write chunk ``i`` of a fixed record set to a new bigWig file.

     The six built-in ``(seqid, start, end, value)`` records are split into
     chunks of three; ``i`` selects the chunk and must be an integer (0 or 1)
     for which a full chunk exists.  Interval starts and ends are written
     shifted down by one.  Returns the path of the file that was written.
     """
     records = [
         ("chr1", 1, 2, 1.5),
         ("chr1", 2, 3, 4.5),
         ("chr1", 3, 4, 1.9),
         ("chr1", 4, 5, 0.45),
         ("chr2", 8, 9, 1.0),
         ("chr2", 9, 10, 6.7)
     ]
     # Only the generated name is used; pyBigWig creates the file itself.
     fn = tempfile.NamedTemporaryFile(suffix=".bw").name
     _records = records[(i*3):(i*3)+3]
     assert len(_records) == 3
     # Largest start position seen per sequence; the header length is that
     # position plus one.  (No sys.maxint sentinel is needed for this, which
     # keeps the helper usable on Python 3.)
     max_pos = {}
     for rec in _records:
         seqid, pos = rec[0], rec[1]
         max_pos[seqid] = max(max_pos.get(seqid, 0), pos)
     regions = [(s, max_pos[s] + 1) for s in sorted(max_pos)]
     bw = pyBigWig.open(fn, "w")
     try:
         bw.addHeader(regions)
         bw.addEntries([rec[0] for rec in _records],
                       [rec[1]-1 for rec in _records],
                       ends=[rec[2]-1 for rec in _records],
                       values=[rec[3] for rec in _records])
     finally:
         bw.close()
     return fn
Ejemplo n.º 4
0
def main():
    """Convert a BigWig file into an HDF5 file, one dataset per chromosome.

    Command line: ``[options] <in_bw_file> <out_h5_file>``; ``-v`` prints each
    chromosome name as it is processed.  For every chromosome the values over
    its full length are read as a numpy array, cast to float16 and stored as a
    gzip-compressed, shuffled dataset named after the chromosome.
    """
    usage = 'usage: %prog [options] <in_bw_file> <out_h5_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() prints the message and exits.
        parser.error('Must provide input BigWig and output HDF5.')
    bw_file, hdf5_file = args

    # open files; the try/finally blocks make sure both are closed again even
    # if reading or writing a chromosome fails
    bw_in = pyBigWig.open(bw_file)
    try:
        h5_out = h5py.File(hdf5_file, 'w')
        try:
            # for each chromosome
            chrom_lengths = bw_in.chroms()
            for chrom in chrom_lengths:
                if options.verbose:
                    print(chrom)

                # read values
                x = bw_in.values(chrom, 0, chrom_lengths[chrom], numpy=True).astype('float16')

                # write gzipped into HDF5
                h5_out.create_dataset(chrom, data=x, dtype='float16', compression='gzip', shuffle=True)
        finally:
            h5_out.close()
    finally:
        bw_in.close()
Ejemplo n.º 5
0
def gerprunner():
    """Score windows from the file named in ``sys.argv[1]`` with a bigWig track.

    Each chunk produced by ``windower(JimFile(path), chunker(1))`` is paired
    with the first element of ``b.stats(...)`` over the chunk's span, the
    pairs are handed to ``eval2`` together with two VCF paths, and the result
    of ``metrics`` is printed.

    Progress goes to stdout every 100000 chunks; a summary goes to stderr once
    all chunks have been consumed.  Output is produced with calls that behave
    the same on Python 2 and Python 3.
    """

    import pyBigWig

    b = pyBigWig.open("/scratch/ucgd/lustre/u1021864/serial/hg19.gerp.bw")

    input_path = sys.argv[1]
    iterator = JimFile(input_path)
    iterable = windower(iterator, chunker(1))

    def genchunks():
        # NOTE: the "too short" filter is disabled, so nsmall stays 0; the
        # summary line is kept so the stderr output keeps its shape.
        nsmall = 0
        # Counted explicitly so that an empty input reports 0 instead of
        # failing on an unbound loop variable, and so the summary is the
        # number of chunks rather than the last zero-based index.
        total = 0
        for i, chunk in enumerate(iterable):
            score = b.stats("chr"+chunk[0].chrom, chunk[0].start, chunk[-1].end)
            yield chunk, score[0]
            total = i + 1
            if i % 100000 == 0:
                print("{} {} {} {}".format(i, chunk[0].chrom, chunk[0].start, score))
        sys.stderr.write("{} removed for being too short\n".format(nsmall))
        sys.stderr.write("{} total chunks\n".format(total))

    vcf_path = "/scratch/ucgd/lustre/u1021864/serial/clinvar-anno.vcf.gz"
    try:
        res = eval2(genchunks(), vcf_path,
            "/scratch/ucgd/lustre/u1021864/serial/esp-common.vcf.gz")
        print(metrics(res[True], res[False], "gerp.auc.png"))
    finally:
        # Release the bigWig handle once all scoring is finished.
        b.close()
Ejemplo n.º 6
0
    def testBigBed(self):
        """Open a remote bigBed file and check its type, schema and entries.

        The file is fetched over HTTP, so this test needs network access.
        """
        fname = "http://www.encodeproject.org/files/ENCFF001JBR/@@download/ENCFF001JBR.bigBed"
        bb = pyBigWig.open(fname)
        assert(bb is not None)
        # The handle must identify itself as bigBed, not bigWig.
        assert(bb.isBigWig() == 0)
        assert(bb.isBigBed() == 1)
        # Expected schema text, compared verbatim with what bb.SQL() returns.
        SQL = """table RnaElements 
"BED6 + 3 scores for RNA Elements data "
    (
    string chrom;      "Reference sequence chromosome or scaffold"
    uint   chromStart; "Start position in chromosome"
    uint   chromEnd;   "End position in chromosome"
    string name;       "Name of item"
    uint   score;      "Normalized score from 0-1000"
    char[1] strand;    "+ or - or . for unknown"
    float level;       "Expression level such as RPKM or FPKM. Set to -1 for no data."
    float signif;      "Statistical significance such as IDR. Set to -1 for no data."
    uint score2;       "Additional measurement/count e.g. number of reads. Set to 0 for no data."
    )
"""
        output = bb.SQL()
        # SQL() may hand back bytes; normalise to text before comparing.
        if isinstance(output, bytes):
            output = output.decode('ASCII')
        assert(output == SQL)
        # Entries are (start, end, rest-of-line) tuples for the queried range.
        o = bb.entries('chr1',10000000,10020000)
        expected = [(10009333, 10009640, '61035\t130\t-\t0.026\t0.42\t404'), (10014007, 10014289, '61047\t136\t-\t0.029\t0.42\t404'), (10014373, 10024307, '61048\t630\t-\t5.420\t0.00\t2672399')]
        assert(o == expected)
        bb.close()
Ejemplo n.º 7
0
def mhsmidkernelsmooth(bamfile, bwfile, maxinsert=80, mininsert=1, paired=False, kernelsize=30):
    """Write kernel-smoothed ``mhsbam.mhsmidcount`` counts to a bigWig file.

    The chromosome names and lengths come from ``Baminfo.Baminfo(bamfile)``.
    For each chromosome the counts over positions 1..length are passed
    through ``kernelsmooth`` with ``kernelsize``; non-empty results are
    written to ``bwfile`` in ascending position order.
    """
    baminfo = Baminfo.Baminfo(bamfile)
    out = pyBigWig.open(bwfile, "w")
    out.addHeader(list(baminfo.chrlen.items()))

    for chrom in baminfo.chrlen:
        chrom_end = baminfo.chrlen[chrom]
        counts = mhsbam.mhsmidcount(bamfile=bamfile, chromosome=chrom, start=1,
                                    end=chrom_end, maxinsert=maxinsert,
                                    mininsert=mininsert, paired=paired)
        smoothed = kernelsmooth(counts, 1, chrom_end, chrom_end, kernelsize)

        # Nothing to write for chromosomes without any smoothed signal.
        if not smoothed:
            continue

        positions = sorted(smoothed)
        scores = [float(smoothed[pos]) for pos in positions]
        out.addEntries(chrom, starts=positions, values=scores,
                       span=1, step=1)

    out.close()
Ejemplo n.º 8
0
    def coverage_from_bigwig(self, bigwig_file, stepsize=100):
        """Fill ``self.coverage`` with binned mean signal from <bigwig_file>.

        *Keyword arguments:*

        - bigwig_file -- path to bigwig file
        - stepsize -- used stepsize

        *Output:*

        Class variable <coverage>: a list with one numpy array per entry of
        ``self.genomicRegions``, in the same order. Each array holds the mean
        bigWig value of ``int(len(region) / stepsize)`` bins across the region;
        bins without a value (or with a value of 0) are stored as 0. If the
        query for a region fails, that region gets an all-zero array instead.

        """

        self.coverage = []

        bwf = pyBigWig.open(bigwig_file)
        try:
            for gr in self.genomicRegions:
                steps = int(len(gr) / stepsize)
                try:
                    ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
                    ds = [x if x else 0 for x in ds]
                except Exception:
                    # Best effort: a failed query yields zeros.  Unlike a bare
                    # ``except:`` this lets KeyboardInterrupt/SystemExit through.
                    ds = [0] * steps
                self.coverage.append(np.array(ds))
        finally:
            bwf.close()
Ejemplo n.º 9
0
def coveragetobw(bamfile, bwfile, maxinsert, mininsert, paired=False):
    """Write the counts returned by ``mhsbam.coveragecount`` to a bigWig file.

    The chromosome names and lengths come from ``Baminfo.Baminfo(bamfile)``.
    For each chromosome the counts over positions 1..length are requested and,
    when non-empty, written to ``bwfile`` in ascending position order.
    """
    baminfo = Baminfo.Baminfo(bamfile)
    out = pyBigWig.open(bwfile, "w")
    out.addHeader(list(baminfo.chrlen.items()))

    for chrom in baminfo.chrlen:
        chrom_end = baminfo.chrlen[chrom]
        counts = mhsbam.coveragecount(bamfile=bamfile, chromosome=chrom, start=1,
                                      end=chrom_end, maxinsert=maxinsert,
                                      mininsert=mininsert, paired=paired)

        # Nothing to write for chromosomes without any counts.
        if not counts:
            continue

        positions = sorted(counts)
        scores = [float(counts[pos]) for pos in positions]
        out.addEntries(chrom, starts=positions, values=scores,
                       span=1, step=1)

    out.close()
Ejemplo n.º 10
0
def dhscutkernelsmooth(bamfile, bwfile, library='Duke', kernelsize=200):
    """Write kernel-smoothed ``dhsbam.dhcutcount`` counts to a bigWig file.

    The chromosome names and lengths come from ``Baminfo.Baminfo(bamfile)``.
    For each chromosome the counts over positions 1..length are passed
    through ``kernelsmooth`` with ``kernelsize``; non-empty results are
    written to ``bwfile`` in ascending position order.
    """
    baminfo = Baminfo.Baminfo(bamfile)
    out = pyBigWig.open(bwfile, "w")
    out.addHeader(list(baminfo.chrlen.items()))

    for chrom in baminfo.chrlen:
        chrom_end = baminfo.chrlen[chrom]
        cuts = dhsbam.dhcutcount(bamfile=bamfile, chromosome=chrom, start=1,
                                 end=chrom_end, library=library)
        smoothed = kernelsmooth(cuts, 1, chrom_end, chrom_end, kernelsize)

        # Nothing to write for chromosomes without any smoothed signal.
        if not smoothed:
            continue

        positions = sorted(smoothed)
        scores = [float(smoothed[pos]) for pos in positions]
        out.addEntries(chrom, starts=positions, values=scores,
                       span=1, step=1)

    out.close()
Ejemplo n.º 11
0
    def doWrite2(self):
        '''
        Test all three modes of storing entries. Also test to ensure that we get error messages when doing something silly

        This is a modified version of the writing example from libBigWig

        The written file is verified by comparing its md5 digest with a known
        value, so the order and content of the successful writes must not change.
        '''
        chroms = ["1"]*6
        starts = [0, 100, 125, 200, 220, 230, 500, 600, 625, 700, 800, 850]
        ends = [5, 120, 126, 205, 226, 231]
        values = [0.0, 1.0, 200.0, -2.0, 150.0, 25.0, 0.0, 1.0, 200.0, -2.0, 150.0, 25.0, -5.0, -20.0, 25.0, -5.0, -20.0, 25.0]
        # Reserve a file name; the handle is closed so pyBigWig can write to it.
        ofile = tempfile.NamedTemporaryFile(delete=False)
        oname = ofile.name
        ofile.close()
        bw = pyBigWig.open(oname, "w")
        bw.addHeader([("1", 1000000), ("2", 1500000)])

        #Intervals
        bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3])
        bw.addEntries(chroms[3:6], starts[3:6], ends=ends[3:6], values=values[3:6])

        #IntervalSpans
        bw.addEntries("1", starts[6:9], values=values[6:9], span=20)
        bw.addEntries("1", starts[9:12], values=values[9:12], span=20)

        #IntervalSpanSteps, this should instead take an int
        bw.addEntries("1", 900, values=values[12:15], span=20, step=30)
        bw.addEntries("1", 990, values=values[15:18], span=20, step=30)

        #Attempt to add incorrect values. These MUST raise an exception
        #(the assert after each call only runs if no RuntimeError was raised)
        try:
            bw.addEntries(chroms[0:3], starts[0:3], ends=ends[0:3], values=values[0:3])
            assert(1==0)
        except RuntimeError:
            pass
        try:
            bw.addEntries("1", starts[6:9], values=values[6:9], span=20)
            assert(1==0)
        except RuntimeError:
            pass
        try:
            bw.addEntries("3", starts[6:9], values=values[6:9], span=20)
            assert(1==0)
        except RuntimeError:
            pass
        try:
            bw.addEntries("1", 900, values=values[12:15], span=20, step=30)
            assert(1==0)
        except RuntimeError:
            pass

        #Add a few intervals on a new chromosome
        bw.addEntries(["2"]*3, starts[0:3], ends=ends[0:3], values=values[0:3])
        bw.close()
        #check md5sum, this is the simplest method to check correctness
        #(read through a context manager so the handle is closed before removal)
        with open(oname, "rb") as written:
            h = hashlib.md5(written.read()).hexdigest()
        assert(h=="b1ca91d2ff42afdd2efa19a007c1ded4")
        #Clean up
        os.remove(oname)
Ejemplo n.º 12
0
def fetch_from_bigbed(path, chrom, start, end):
    """Yield one ``tx_from_bedfields`` result per bigBed entry in a range.

    Args:
        path: Path of the bigBed file.
        chrom: Chromosome name; it is adapted to the file's naming with
            ``match_chrom_format`` before querying.
        start, end: Bounds of the queried range.

    This is a generator: the file is opened on the first ``next()`` and closed
    when the generator finishes or is closed.  An ``AssertionError`` is raised
    if the file is not a bigBed file.
    """
    import pyBigWig

    bed = pyBigWig.open(path)
    try:
        assert bed.isBigBed(), "Oops, for some reason I was expecting a bed file: {}".format(path)

        chrom = match_chrom_format(chrom, bed.chroms().keys())
        # NOTE(review): pyBigWig appears to return None rather than an empty
        # list when nothing overlaps the range; treat that as "no entries".
        entries = bed.entries(chrom, start, end) or []
        for cur_start, cur_end, bed_line in entries:
            bed_line = bed_line.split()
            yield tx_from_bedfields([chrom, cur_start, cur_end] + bed_line)
    finally:
        bed.close()
Ejemplo n.º 13
0
    def __init__(self, wig_location):
        """
        Remember the bigwig path and open it with pyBigWig.

        Arguments
        ---------
        wig_location: Path to bigwig

        Any error raised while opening is re-raised as a MocaException.
        """
        self.wig_location = wig_location
        try:
            handle = pyBigWig.open(wig_location)
        except Exception as err:
            raise MocaException('Error reading wig file: {}'.format(err))
        self.wig = handle
Ejemplo n.º 14
0
    def readValuesPyBigWig(self, reference, start, end):
        """
        Use pyBigWig package to read a BigWig file for the
        given range and yield protocol objects.

        pyBigWig returns an array of values that fill the query range.
        Not sure if it is possible to get the step and span.

        The range is read in pieces of ``self._INCREMENT`` positions. NaN
        values are never emitted: every run of consecutive non-NaN values
        becomes one ``protocol.Continuous`` object whose ``start`` is the
        position of its first value, and a run is also cut once it holds
        ``self._MAX_VALUES`` values.

        pyBigWig throws an exception if end is outside of the
        reference range. This function clamps the query range
        (negative start to 0, end to the reference length) and throws
        its own exceptions to avoid the ones thrown by pyBigWig:
        ReferenceNameNotFoundException for an unknown reference and
        ReferenceRangeErrorException when start >= end after clamping.

        This is a generator; the file is closed when the range has been
        read, when an error is raised, or when the generator is closed early.
        """
        if not self.checkReference(reference):
            raise exceptions.ReferenceNameNotFoundException(reference)
        if start < 0:
            start = 0
        bw = pyBigWig.open(self._sourceFile)
        try:
            referenceLen = bw.chroms(reference)
            if referenceLen is None:
                raise exceptions.ReferenceNameNotFoundException(reference)
            if end > referenceLen:
                end = referenceLen
            if start >= end:
                raise exceptions.ReferenceRangeErrorException(
                    reference, start, end)

            data = protocol.Continuous()
            curStart = start
            curEnd = curStart + self._INCREMENT
            while curStart < end:
                if curEnd > end:
                    curEnd = end
                for i, val in enumerate(bw.values(reference, curStart, curEnd)):
                    if not math.isnan(val):
                        if len(data.values) == 0:
                            data.start = curStart + i
                        data.values.append(val)
                        if len(data.values) == self._MAX_VALUES:
                            yield data
                            data = protocol.Continuous()
                    elif len(data.values) > 0:
                        # A NaN ends the current run of values.
                        yield data
                        data = protocol.Continuous()
                curStart = curEnd
                curEnd = curStart + self._INCREMENT
        finally:
            bw.close()

        if len(data.values) > 0:
            yield data
Ejemplo n.º 15
0
def bedGraphToBigWig(chromSizes, bedGraphPath, bigWigPath, sort=True):
    """
    takes a bedgraph file, orders it and converts it to
    a bigwig file using pyBigWig.

    Args:
        chromSizes: Iterable of ``(chromosome name, size)`` pairs.
        bedGraphPath: Path of the bedGraph file to convert.
        bigWigPath: Path of the bigWig file to write.
        sort: If true, the bedGraph is first sorted into a temporary file
            with the external sort tool named in the configuration.

    If the bedGraph file is smaller than 10 bytes an error is written to
    stderr and the process exits with status 1.
    """

    import os
    import sys
    from tempfile import NamedTemporaryFile

    # Make a list of tuples for the bigWig header, this MUST be sorted identically to the bedGraph file
    sort_cmd = cfg.config.get('external_tools', 'sort')
    _file = NamedTemporaryFile(delete=False)
    for chrom, size in chromSizes:
        _file.write(toBytes("{}\t{}\n".format(chrom, size)))
    _file.close()
    sorted_sizes = "{}.sorted".format(_file.name)
    os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}.sorted".format(sort_cmd, _file.name, _file.name))
    cl = []
    try:
        with open(sorted_sizes) as f:
            for line in f:
                chrom, chromLen = line.split()
                cl.append((chrom, int(chromLen)))
    finally:
        # The two helper files are only needed to build the header list.
        os.remove(_file.name)
        os.remove(sorted_sizes)

    # check if the file is empty
    if os.stat(bedGraphPath).st_size < 10:
        sys.stderr.write(
            "Error: The generated bedGraphFile was empty. Please adjust\n"
            "your deepTools settings and check your input files.\n")
        sys.exit(1)

    tempfilename1 = None
    if sort:
        # temporary file to store sorted bedgraph file
        _file = NamedTemporaryFile(delete=False)
        tempfilename1 = _file.name
        # Only the name is needed (the shell redirect writes the content),
        # so release the descriptor right away.
        _file.close()
        os.system("LC_ALL=C {} -k1,1 -k2,2n {} > {}".format(sort_cmd, bedGraphPath, tempfilename1))
        bedGraphPath = tempfilename1

    try:
        bw = pyBigWig.open(bigWigPath, "w")
        assert(bw is not None)
        try:
            # The lack of maxZooms will change the results a bit, perhaps the defaults are better
            bw.addHeader(cl, maxZooms=10)
            with open(bedGraphPath) as f:
                for line in f:
                    interval = line.split()
                    bw.addEntries([interval[0]], [int(interval[1])], ends=[int(interval[2])], values=[float(interval[3])])
        finally:
            bw.close()
    finally:
        # Remove the sorted copy even if writing the bigWig failed.
        if tempfilename1 is not None:
            os.remove(tempfilename1)
 def test_bigwig(self):
     """Compare values stored in ``self.bw_file`` with ``self.csv_records``.

     For every record the bigWig value at position ``int(rec[1]) - 1`` of
     sequence ``rec[0]`` (double quotes stripped) is read as an integer.  Its
     low 16 bits and its remaining high bits are each divided by 100; the
     low-bits value is compared with ``rec[8]`` when ``rec[2] == "1"`` and
     the high-bits value otherwise, to one decimal place.
     """
     import pyBigWig
     f = pyBigWig.open(self.bw_file)
     try:
         for rec in self.csv_records:
             seqid = rec[0].replace('"', "")
             tpl = int(rec[1]) - 1
             s = int(f.values(seqid, tpl, tpl+1)[0])
             ipd_minus = (s % 65536) / 100.0
             ipd_plus = (s >> 16) / 100.0
             if rec[2] == "1":
                 self.assertAlmostEqual(ipd_minus, float(rec[8]), places=1)
             else:
                 self.assertAlmostEqual(ipd_plus, float(rec[8]), places=1)
     finally:
         # Close the file even when an assertion fails.
         f.close()
Ejemplo n.º 17
0
def gather_bigwig(input_files, output_file):
    """Merge several bigWig chunk files into one bigWig file.

    Args:
        input_files: Paths of the chunk files; zero-byte files are skipped.
        output_file: Path of the merged file to write.

    The output header lists every sequence seen in any chunk (sorted by
    name) with the largest length declared for it.  Chunks are processed
    ordered by their first sequence name and that sequence's declared
    length; within a chunk, sequences are processed in sorted order and every
    non-NaN position is written as a one-base interval.  If no usable chunk
    exists, an empty ``output_file`` is created.

    Returns ``output_file``.
    """
    import pyBigWig
    chr_lengths = {}
    FileInfo = namedtuple("FileInfo", ("file_name", "seqid", "length"))
    files_info = []
    for file_name in input_files:
        log.info("Reading header info from {f}...".format(f=file_name))
        if op.getsize(file_name) == 0:
            continue
        bw_chunk = pyBigWig.open(file_name)
        try:
            chroms = bw_chunk.chroms()
        finally:
            bw_chunk.close()
        if not chroms:
            # A chunk without sequences contributes nothing.
            continue
        for seqid, length in chroms.items():
            chr_lengths[seqid] = max(length, chr_lengths.get(seqid, 0))
        # Order chunks by their *first* sequence.  The previous code computed
        # this name but then used whatever sequence the loop above happened
        # to visit last.
        seqid_min = min(chroms)
        files_info.append(FileInfo(file_name, seqid_min, chroms[seqid_min]))
    if len(files_info) == 0:
        # Nothing to merge: leave an empty placeholder file behind.
        with open(output_file, "wb"):
            pass
        return output_file
    files_info.sort(key=lambda info: (info.seqid, info.length))
    bw = pyBigWig.open(output_file, "w")
    try:
        regions = [(s, chr_lengths[s]) for s in sorted(chr_lengths)]
        bw.addHeader(regions)
        for file_info in files_info:
            log.info("Reading values from {f}...".format(f=file_info.file_name))
            bw_chunk = pyBigWig.open(file_info.file_name)
            try:
                chunk_chroms = bw_chunk.chroms()
                for seqid in sorted(chunk_chroms):
                    seqids, starts, ends, values = [], [], [], []
                    chr_max = chunk_chroms[seqid]
                    for i, val in enumerate(bw_chunk.values(seqid, 0, chr_max)):
                        if not math.isnan(val):
                            seqids.append(seqid)
                            starts.append(i)
                            ends.append(i+1)
                            values.append(val)
                    # Skip sequences that hold only NaN; there is nothing to add.
                    if seqids:
                        bw.addEntries(seqids, starts, ends=ends, values=values)
            finally:
                bw_chunk.close()
    finally:
        bw.close()
    return output_file
Ejemplo n.º 18
0
def big_wig_corr(full, semi, regions):
    """Pearson correlation between two bigWig tracks over a set of regions.

    Args:
        full: Path of the first bigWig file; it decides which positions count.
        semi: Path of the second bigWig file.
        regions: Anything ``pybedtools.BedTool`` accepts.

    For every interval, only positions where ``full`` is neither NaN nor 0 are
    kept.  The values of ``semi`` at those positions are used with NaN
    replaced by ``np.nan_to_num``.  The kept values of all intervals are
    concatenated and passed to ``stats.pearsonr``, whose result is returned.
    """
    full_bw = pyBigWig.open(full)
    try:
        semi_bw = pyBigWig.open(semi)
        try:
            full_result = []
            semi_result = []
            for interval in pybedtools.BedTool(regions):
                gene_full_values = np.array(
                    full_bw.values(interval.chrom, interval.start, interval.stop))
                gene_semi_values = np.array(
                    semi_bw.values(interval.chrom, interval.start, interval.stop))

                # The same mask (derived from the first track) is applied to
                # both tracks so the two result vectors stay aligned.
                keep = ~np.isnan(gene_full_values) & (gene_full_values != 0)

                full_result.append(gene_full_values[keep])
                semi_result.append(np.nan_to_num(gene_semi_values[keep]))
        finally:
            semi_bw.close()
    finally:
        full_bw.close()

    full_result = np.concatenate(full_result)
    semi_result = np.concatenate(semi_result)

    return stats.pearsonr(full_result, semi_result)
Ejemplo n.º 19
0
def convert_bigwig(mapping_table, bw_in_filename, bw_out_filename, verbose=False):
    """
    convert chromosome names of a bigwig file according to given mapping_table

    it checks which chromosome names that can correctly mapped, all other chromosomes are skipped

    Args:
        mapping_table: Dict from chromosome names used in the input file to
            the names to write.
        bw_in_filename: Path of the bigWig file to read.
        bw_out_filename: Path of the bigWig file to write.
        verbose: If true, progress messages are printed.

    Exits with status 1 when none of the input chromosomes is in
    ``mapping_table``.  Mapped chromosomes without any interval are listed in
    the output header but get no entries.
    """
    bw = pyBigWig.open(bw_in_filename)
    try:
        curr_chroms = bw.chroms()

        final_mapping_table = {}
        new_chroms = {}

        for c in curr_chroms:
            if c not in mapping_table:
                if (verbose):
                    print("skip original chrom \'" + c + "\' - cannot be found in mapping table! Right GENOME & FROM_FORMAT?")
                continue
            final_mapping_table[c] = mapping_table[c]
            new_chroms[mapping_table[c]] = curr_chroms[c]

        if (len(new_chroms) <= 0):
            print("No chromosomes found for mapping! Wrong 'FROM_FORMAT'?")
            sys.exit(1)

        bw_out = pyBigWig.open(bw_out_filename, "w")
        try:
            bw_out.addHeader(list(new_chroms.items()))

            for c, c_map in final_mapping_table.items():
                c_int = bw.intervals(c)
                if verbose:
                    print("convert chromosome: ", c, " --> ", c_map)
                # pyBigWig hands back None (not an empty list) for a
                # chromosome without intervals; there is nothing to copy then.
                if not c_int:
                    continue
                bw_out.addEntries([c_map] * len(c_int),
                                  [x[0] for x in c_int],
                                  ends=[x[1] for x in c_int],
                                  values=[x[2] for x in c_int])
        finally:
            bw_out.close()
    finally:
        bw.close()

    if (verbose):
        print("\nbigwig conversion finished!\n")
Ejemplo n.º 20
0
def getBigWigMean(regions, bigwig_file, non_nan):
    """Return one mean bigWig value per region.

    Args:
        regions: Iterable of objects with ``chrom``, ``start`` and ``stop``.
        bigwig_file: Path of the bigWig file.
        non_nan: If equal to 1, use the first element of ``bw.stats`` for the
            region and report 0 when that is None.  Otherwise (default mode)
            average over the whole region with NaN counted as 0.

    Returns a list with one value per region, in input order.
    """
    bw = pyBigWig.open(bigwig_file)
    profile = []
    try:
        if non_nan == 1:
            # average over non_nan region; nan considered missing
            for region in regions:
                mean = bw.stats(str(region.chrom), region.start, region.stop)[0]
                profile.append(0 if mean is None else mean)
        else:
            # average over whole region, default; nan considered as 0
            for region in regions:
                values = bw.values(str(region.chrom), region.start, region.stop)
                profile.append(np.mean(np.nan_to_num(values)))
    finally:
        bw.close()
    return profile
Ejemplo n.º 21
0
    def __init__(self, cov_file):
        """Open a coverage file, choosing the reader by file extension.

        ``.bw``/``.bigwig`` files are opened with pyBigWig and flag
        ``self.bigwig`` as True; ``.h5``/``.hdf5``/``.w5``/``.wdf5`` files are
        opened with h5py.  Extensions are compared case-insensitively.  Any
        other extension prints a message to stderr and exits with status 1.
        """
        self.cov_file = cov_file
        self.bigwig = False

        extension = os.path.splitext(self.cov_file)[1].lower()
        is_bigwig = extension in ('.bw', '.bigwig')
        is_hdf5 = extension in ('.h5', '.hdf5', '.w5', '.wdf5')

        if is_bigwig:
            self.cov_open = pyBigWig.open(self.cov_file, 'r')
            self.bigwig = True
        elif is_hdf5:
            self.cov_open = h5py.File(self.cov_file, 'r')
        else:
            print('Cannot identify coverage file extension "%s".' % extension,
                  file=sys.stderr)
            exit(1)
Ejemplo n.º 22
0
def initBigWig(exampleBW, outname):
    """!
    Initiates a bigWig file

    @param exampleBW String: name of the bigWig file from which to use the header info
    @param outname String: prefix for the file name of the bigWig file to be created
    ('.bw' is appended)

    @return Opens a bigWig file and adds a header so that it's ready to
    receive more information. The caller is responsible for closing it.
    """

    # read the header info from an example bigWig; the example file is only
    # needed for its chromosome table, so close it right after reading
    bw_check = pyBigWig.open(exampleBW)
    try:
        # wrangle the chrom info into a list of (name, length) tuples
        chrom_info = list(bw_check.chroms().items())
    finally:
        bw_check.close()

    #open a new bigWig file
    bw_out = pyBigWig.open(outname + '.bw', "w")
    bw_out.addHeader(chrom_info)

    return bw_out
Ejemplo n.º 23
0
 def get_chrom_info(self, chrom_name):
     """Return length, mean and std of one chromosome of ``self.bigWig_file``.

     The result is a dict with the keys 'chrom_name', 'chrom_len',
     'chrom_mean' and 'chrom_std'; mean and std are the first element of
     ``stats(..., exact=True)`` for the respective type.
     """
     bw = pyBigWig.open(self.bigWig_file)
     length = bw.chroms(chrom_name)
     mean = bw.stats(chrom_name, type='mean', exact=True)[0]
     std = bw.stats(chrom_name, type='std', exact=True)[0]
     bw.close()
     return {
         'chrom_name': chrom_name,
         'chrom_len': length,
         'chrom_mean': mean,
         'chrom_std': std,
     }
Ejemplo n.º 24
0
def group_and_process_data(bigwig_data, input_data, feature_name):
    """Run ``extract_data`` on every "CHROM" group and concatenate the results.

    Args:
        bigwig_data: Path of the bigWig file handed to ``extract_data``.
        input_data: Object supporting ``groupby("CHROM")`` (a DataFrame).
        feature_name: Passed through to ``extract_data``.

    A message is printed if the file is not a bigWig file; processing
    continues regardless.  Returns the concatenation of the processed groups.
    """
    bw_file = pw.open(bigwig_data)
    try:
        if not bw_file.isBigWig():
            print("The given file is not in BigWig format!!!")

        data_grouped = [group for _key, group in input_data.groupby("CHROM")]

        processed = []
        for group in data_grouped:
            result = extract_data(group, feature_name, bw_file)
            # Keep what extract_data returns; previously the return value was
            # assigned to the loop variable and silently dropped.  If it
            # works in place and returns nothing, keep the group itself.
            processed.append(group if result is None else result)
    finally:
        bw_file.close()

    data_combined = pd.concat(processed)

    return data_combined
Ejemplo n.º 25
0
def _to_bigwig(self, path, chromosome_sizes, rpm=True, divide=False, value_col=None, dryrun=False):
    """Write the coverage of ``self`` to a bigWig file.

    Args:
        path: Path of the bigWig file to write.
        chromosome_sizes: Dict from chromosome name to size, or an object
            whose ``.df`` has ``Chromosome`` and ``End`` columns.
        rpm: Passed to ``to_rle``.
        divide: If true, the run lengths computed with ``value_col`` are
            divided by those computed without it and log2-transformed.
        value_col: Passed to ``to_rle``.
        dryrun: If true, return the sorted ranges instead of writing a file.

    Returns the ranges when ``dryrun`` is true, otherwise None.  Exits with
    status 1 when pyBigWig is not installed.
    """

    try:
        import pyBigWig
    except ModuleNotFoundError:
        print("pybigwig must be installed to create bigwigs. Use `conda install -c bioconda pybigwig` or `pip install pybigwig` to install it.")
        import sys
        sys.exit(1)

    if not divide:
        gr = self.to_rle(rpm=rpm, strand=False, value_col=value_col).to_ranges()
    else:
        gr = self.to_rle(rpm=rpm, strand=False, value_col=value_col)
        divide_by = self.to_rle(rpm=rpm, strand=False)
        c = (gr / divide_by)
        # log2-transform each run length object in place before converting.
        for _key, v in c.items():
            v.values = np.log2(v.values)
            v.defragment()

        gr = c.defragment().to_ranges()

    unique_chromosomes = gr.chromosomes

    subset = ['Chromosome', 'Start', 'End', 'Score']

    gr = gr[subset].unstrand()

    gr = gr.sort()

    if dryrun:
        return gr

    if not isinstance(chromosome_sizes, dict):
        size_df = chromosome_sizes.df
        chromosome_sizes = {k: v for k, v in zip(size_df.Chromosome, size_df.End)}

    header = [(chrom, int(chromosome_sizes[chrom])) for chrom in unique_chromosomes]

    bw = pyBigWig.open(path, "w")
    try:
        bw.addHeader(header)

        for chromosome, df in gr:
            chromosomes = df.Chromosome.tolist()
            starts = df.Start.tolist()
            ends = df.End.tolist()
            values = df.Score.tolist()

            bw.addEntries(chromosomes, starts, ends=ends, values=values)
    finally:
        # The file must be closed explicitly so that it is finalised on disk.
        bw.close()
Ejemplo n.º 26
0
def get_bigWig_scores(map_args, def_param=(scores1, scores2)):
    """
    Inner loop for multithreading over bigWig score features.

    ``map_args`` is the tuple ``(i, train, Peak, opt)``: row index, table with
    an ``iloc`` accessor, bigWig path and options (``opt.cons_extension`` is
    used).  The summed bigWig values over the two anchors of row ``i`` are
    combined into their mean (stored in ``scores1[i]``) and their standard
    deviation (stored in ``scores2[i]``).  ``def_param`` is not read in the
    body; the module-level ``scores1``/``scores2``/``lock`` are used directly.
    """
    (i, train, Peak, opt) = map_args
    bw = pyBigWig.open(Peak)
    try:
        row = train.iloc[i]
        anchor1, anchor2 = prepare_anchors(row, opt.cons_extension)
        con1 = sum(bw.values(anchor1.chrom, anchor1.start, anchor1.end))
        con2 = sum(bw.values(anchor2.chrom, anchor2.start, anchor2.end))
    finally:
        bw.close()

    # Release the lock even if storing a score fails, so other workers are
    # never left waiting on it.
    lock.acquire()
    try:
        scores1[i] = (con1 + con2) / 2.0
        scores2[i] = np.std([con1, con2])
    finally:
        lock.release()
Ejemplo n.º 27
0
def load_big_file(name, rel_path='data', is_abs_path=False):
    """
    Open a bigwig file with pyBigWig.

    The location is resolved by ``set_path`` from the three arguments.

    :param name: Name of the file or absolute path if is_abs_path is set to True
    :type name: str
    :param rel_path: Relative path without the name from current directory
    :type rel_path: str
    :param is_abs_path: If True, name is interpreted as absolute path.
    :type is_abs_path: bool
    :return: bigWigFile object
    """
    resolved = set_path(name, rel_path=rel_path, is_abs_path=is_abs_path)
    return pyBigWig.open(resolved)
Ejemplo n.º 28
0
    def __init__(self, wig_location):
        """
        Remember the wig path and open it with pyBigWig.

        Parameters
        ---------
        wig_location : string
                       Path to wig file

        Any error raised while opening is re-raised as a plain Exception
        whose message contains the absolute path and the original error.
        """
        self.wig_location = wig_location
        try:
            handle = pyBigWig.open(wig_location)
        except Exception as err:
            full_path = os.path.abspath(self.wig_location)
            raise Exception('Error reading wig file {} : {}'.format(full_path, err))
        self.wig = handle
def calculateScalerForNorm(resultMeta):
    """Return ``read.CTRLBW_SUM`` divided by the summed 'sumData' of all files.

    ``resultMeta[k][0]`` is taken as the path of the k-th bigWig file; the
    'sumData' entry of each file's header is added up.  The resulting scaler
    is printed and returned.
    """
    total = 0

    for idx in range(len(resultMeta)):
        bigwig = pyBigWig.open(resultMeta[idx][0])
        total += float(bigwig.header().get('sumData'))
        bigwig.close()

    scaler = read.CTRLBW_SUM / float(total)

    print("Scaler:")

    print(scaler)
    return scaler
Ejemplo n.º 30
0
def compare_main(args):
    """Combine two bigWig files with the operation named in ``args.operation``.

    Both inputs (``args.infile1``, ``args.infile2``) are converted to arrays
    at resolution ``args.res``, combined with one of the ``compare_*``
    functions and written to ``args.outfile`` using the chromosome table of
    the first input.
    """
    operations = {
        "log2ratio": compare_log2ratio,
        "add": compare_add,
        "subtract": compare_subtract,
        "divide": compare_divide,
        "recipratio": compare_recipratio
    }

    # read in files
    first = pyBigWig.open(args.infile1)
    second = pyBigWig.open(args.infile2)
    first_arrays = bigwig_to_arrays(first, res=args.res)
    second_arrays = bigwig_to_arrays(second, res=args.res)

    # perform operation
    combine = operations[args.operation]
    combined = combine(first_arrays, second_arrays)

    # write out file
    write_arrays_to_bigwig(
        args.outfile,
        combined,
        first.chroms(),
        res=args.res,
        dropNaNsandInfs=args.dropNaNsandInfs,
    )
    first.close()
    second.close()
Ejemplo n.º 31
0
def extract_bigwig_worker(lines, bwFile=None, stepSize=1, stranded=1, bw=None):
    '''Helper mapper for querying BigWig.

    Every non-empty entry of ``lines`` is a tab-separated BED record.  The
    number of fields is taken from the first record and applied to all of
    them: with 6 or more fields the id and strand come from columns 4 and 6,
    with 4 or 5 fields only the id is read, otherwise the id is 'NoID'.
    Without a strand column the strand is '+'.

    :param lines: iterable of BED lines; falsy entries are dropped
    :param bwFile: path of the bigWig file to query
    :param stepSize: stride used when sub-sampling the per-base values
    :param stranded: when truthy, values of '-' strand records are reversed
    :param bw: ignored, kept for backward compatibility; the handle is
        always opened from ``bwFile``
    :return: list of ``(id, values)`` tuples in input order; ``values`` is
        None when the chromosome is absent from the bigWig.  Empty input
        gives an empty list without opening the file.
    '''
    lines = [x for x in lines if x]
    if not lines:
        return []
    nField = len(lines[0].strip().split('\t'))

    bw = pybw.open(bwFile)
    try:
        chromL = bw.chroms()
        res = []
        for line in lines:
            cols = line.strip().split('\t')
            chrom, start, end = cols[0], int(cols[1]), int(cols[2])
            name = 'NoID'
            strand = '+'
            if nField >= 6:
                name, _score, strand = cols[3:6]
            elif nField >= 4:
                name = cols[3]

            if chrom not in chromL:
                vals = None
            else:
                # Clamp the query to the chromosome bounds.
                start = max(0, start)
                end = min(chromL[chrom], end)
                sec = bw.values(chrom, start, end, numpy=0)
                # Compare by value: identity tests on str/int literals only
                # work by interpreter accident.
                if strand != '-' or not stranded:
                    vals = sec[::stepSize]
                else:
                    vals = sec[::-stepSize]
            res.append((name, vals))
    finally:
        bw.close()
    return res
Ejemplo n.º 32
0
def create_bw(name, chr_list, len_list):
    """
    create a bigwiggle with random binary values

    Writes ``name + ".bw"``.  Each chromosome ``chr_list[i]`` is declared
    with length ``len_list[i] + 1`` and receives ``len_list[i]`` random
    0.0/1.0 values, one per base starting at position 0.

    The earlier version called ``addEntries`` once, after the loop, so only
    the last chromosome was submitted; it also passed mismatched list
    lengths and integer values, and never closed the file.
    """
    header = [(chrom, length + 1) for chrom, length in zip(chr_list, len_list)]
    bw_file = bg.open(name + ".bw", "w")
    try:
        print(header)
        bw_file.addHeader(header)
        # Entries are added in header order, one fixed-step run per chromosome.
        for chrom, length in zip(chr_list, len_list):
            if length <= 0:
                continue
            valeurs = [float(rd.randint(0, 1)) for _ in range(length)]
            bw_file.addEntries(chrom, 0, values=valeurs, span=1, step=1)
    finally:
        # Closing finalises the file; without it the output may be unusable.
        bw_file.close()
Ejemplo n.º 33
0
def main(LineArgs):
	"""Collect binned signal per transcript and write raw and final tables.

	Reads the transcript table named by ``LineArgs.TranscriptPath``, applies
	``getSignal`` to every row using the two bigWig files named by
	``LineArgs.RespVarFilePath`` and ``LineArgs.InputFilePath``, writes the
	raw table to ``LineArgs.OutputFilePath`` and the output of
	``calcRespVars`` to the same path with ".txt" replaced by
	"_FinalSignal.txt".
	"""
	started = time.time()
	# Command-line inputs
	acetyl_path = LineArgs.RespVarFilePath
	input_path = LineArgs.InputFilePath
	output_path = LineArgs.OutputFilePath
	transcript_path = LineArgs.TranscriptPath
	# Transcript definitions
	transcripts = pd.read_csv(transcript_path, header="infer", sep="\t")
	# ChIP-seq signal tracks
	acetyl_bw = pyBigWig.open(acetyl_path)
	input_bw = pyBigWig.open(input_path)
	# Bin layout and the matching column names
	bin_ranges = computeBinRanges(40, 250)
	columns = buildHeader(40)
	# Signal for every transcript row
	print("start resp var collect:")
	signal_args = (acetyl_bw, input_bw, bin_ranges)
	transcripts["SignalOutput"] = transcripts.apply(getSignal, args=signal_args, axis=1)
	signal_rows = transcripts.SignalOutput.values.tolist()
	transcripts[columns] = pd.DataFrame(signal_rows, index=transcripts.index)
	# Drop the unparsed signal column and the columns no longer needed
	transcripts.drop(["SignalOutput", "Bins"], inplace=True, axis=1)
	print("Printing to file", output_path)
	transcripts.to_csv(output_path, header=True, sep="\t", index=False)
	elapsed = time.time() - started
	print("Total Raw feature time to complete,", str(elapsed)+"s")
	print("Start Feature calculation pipe:")
	final_table = calcRespVars(transcripts)
	output_path = output_path.replace(".txt", "_FinalSignal.txt")
	print("Printing to file", output_path)
	final_table.to_csv(output_path, header=True, sep="\t", index=True)
Ejemplo n.º 34
0
def make_bigwig(bigwig_in_path, bigwig_out_path, header_count_path):
    """make bigwig from bam

    Copies, from the bigWig at ``bigwig_in_path`` into a new bigWig at
    ``bigwig_out_path``, the contigs listed in ``header_count_path`` (first
    space-separated token of each line) that exist in the input file.
    Nothing is written when none of the listed contigs is present.

    Both handles are closed on every exit path, including the early exit
    and errors.

    :return: None
    """
    bw = pyBigWig.open(str(bigwig_in_path))
    try:
        bw_out = pyBigWig.open(str(bigwig_out_path), "w")
        try:
            _copy_listed_contigs(bw, bw_out, header_count_path)
        finally:
            bw_out.close()
    finally:
        bw.close()

    return None


def _copy_listed_contigs(bw, bw_out, header_count_path):
    """Write the header and per-base values of the listed contigs found in ``bw``."""
    # One pass over the count file: keep the contigs the input knows about.
    header = []
    with open(str(header_count_path), "r") as header_count:
        for line in header_count:
            contig = line.split(' ')[0]
            length = bw.chroms(contig)
            if length:
                header.append((contig, length))

    if not header:
        return

    bw_out.addHeader(header)

    for contig, length in header:
        values = bw.values(contig, 0, length)
        # NOTE(review): values are read from position 0 but written starting
        # at position 1 with validation disabled; kept as in the original,
        # confirm the one-base shift is intended.
        bw_out.addEntries(contig,
                          1,
                          values=values,
                          span=1,
                          step=1,
                          validate=False)
Ejemplo n.º 35
0
    def __init__(self,
                 reference_sequence,
                 target_path,
                 features,
                 seed=436,
                 validation_holdout=['chr6', 'chr7'],
                 test_holdout=['chr8', 'chr9'],
                 sequence_length=1000,
                 bin_size=200,
                 step_size=100,
                 bins_start=200,
                 bins_end=800,
                 feature_thresholds=0.5,
                 mode="train",
                 save_datasets=[],
                 output_dir=None,
                 additional_bw_files=None):
        """Set up the sampler and open any additional bigWig files.

        All arguments except ``additional_bw_files`` are forwarded unchanged
        to the parent initializer.  ``additional_bw_files`` is an optional
        iterable of bigWig paths; each is opened with ``pyBigWig.open`` and
        the handles are kept in ``self.all_bw_files``.
        """
        parent = super(RandomFilesSampler, self)
        parent.__init__(reference_sequence,
                        target_path,
                        features,
                        seed=seed,
                        validation_holdout=validation_holdout,
                        test_holdout=test_holdout,
                        sequence_length=sequence_length,
                        bin_size=bin_size,
                        step_size=step_size,
                        bins_start=bins_start,
                        bins_end=bins_end,
                        feature_thresholds=feature_thresholds,
                        mode=mode,
                        save_datasets=save_datasets,
                        output_dir=output_dir)

        # Per-mode sampling state: nothing to sample from yet, empty cache.
        self._sample_from_mode = {}
        self._randcache = {}
        for mode_name in self.modes:
            self._sample_from_mode[mode_name] = None
            self._randcache[mode_name] = {"cache_indices": [], "sample_next": 0}

        self.sample_from_intervals = []
        self.interval_lengths = []
        self.initialized = False

        # Handles for the optional extra bigWig tracks.
        self.all_bw_files = []
        if additional_bw_files is not None:
            self.all_bw_files.extend(
                pyBigWig.open(path) for path in additional_bw_files)
Ejemplo n.º 36
0
def dyad_coverage_sample(sample, genes, minp, maxp, smoothing=None):
    '''Finds the distribution of distances between fragments and dyad for a single sample.

    Reads coverage from ``sample + '-cov.bw'`` around the position in
    column 6 of every row of ``genes`` (chromosome in column 1, strand in
    column 4), then writes ``sample + '-genes.txt'``,
    ``sample + '-dyad.txt'`` and the plot ``sample + '-dyad.png'``.

    :param sample: sample name, used as prefix of the input and output files
    :param genes: DataFrame of genes; one 'dyad position N' column is added
        per position (the frame is modified in place)
    :param minp: first position relative to the dyad
    :param maxp: last position relative to the dyad
    :param smoothing: smoothing window size; None or 0 disables smoothing
    '''
    print ('Finds the distribution of ditances between fragments and dyad of sample {}'.format(sample))
    if not smoothing:
        smoothing = 0
    # From here on ``smoothing`` is the half-window.
    smoothing = math.ceil(smoothing / 2.0)
    n_positions = maxp - minp + smoothing * 2 + 1
    distances = [[] for _ in range(n_positions)]
    bw = pbw.open(sample + '-cov.bw')
    try:
        for index, columns in genes.iterrows():
            chromosome = columns[1]
            max_end = bw.chroms(chromosome)
            if not max_end:
                max_end = 0
            negative = columns[4] == NEGATIVE_STRAND
            theo_start = int(columns[6]) + minp - smoothing
            # Clamp the window to the chromosome; positions outside it count as 0.
            start = max(theo_start, 0)
            end = min(int(columns[6]) + maxp + smoothing + 1, max_end)
            distance = signal(bw, chromosome, start, end) if end > start else []
            if negative:
                distance.reverse()
            for i in range(n_positions):
                distance_index = i - (start - theo_start)
                value = distance[distance_index] if distance_index in range(0, len(distance)) else 0
                distances[i].append(value if value and not math.isnan(value) else 0)
    finally:
        # The coverage file is not needed past this point.
        bw.close()
    for i in range(n_positions):
        genes['dyad position ' + str(i + minp - smoothing)] = distances[i]
    genes_output = sample + '-genes.txt'
    genes.to_csv(genes_output, sep='\t', index=False)
    sums = pd.DataFrame(index=list(range(minp - smoothing, maxp + smoothing + 1)))
    sums['Frequency'] = [genes['dyad position ' + str(i)].sum() for i in range(minp - smoothing, maxp + smoothing + 1)]
    dyads = pd.DataFrame(index=list(range(minp, maxp + 1)), columns=['Frequency', 'Relative Frequency'])
    for i in range(minp, maxp + 1):
        # With no smoothing the half-window is 0 and the original range was
        # empty; always average at least the position itself.
        window_end = max(i + smoothing, i + 1)
        dyads.at[i, 'Frequency'] = mean([sums.at[j, 'Frequency'] for j in range(i - smoothing, window_end)])
    frequency_sum = dyads['Frequency'].sum()
    for i in range(minp, maxp + 1):
        dyads.at[i, 'Relative Frequency'] = dyads.at[i, 'Frequency'] / frequency_sum
    dyad_output = sample + '-dyad.txt'
    dyads.to_csv(dyad_output, sep='\t')
    x = dyads.index.values
    plt.figure()
    plt.title(sample)
    plt.xlabel('Position relative to dyad (bp)')
    plt.ylabel('Relative Frequency')
    plt.xlim(x[0], x[len(x) - 1])
    plt.xticks(list(range(x[0], x[len(x) - 1] + 1, 25)))
    plt.plot(dyads.index.values, dyads['Relative Frequency'].values, color='red')
    plot_output = sample + '-dyad.png'
    plt.savefig(plot_output)
    plt.clf()
Ejemplo n.º 37
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect the 'AGGT' sites of one intron that pass all filters.

    A site is kept when ``cal_distance`` and ``cal_score`` both flag it as
    passing, its scored window contains no 'N', and the first value of the
    bigWig ``stats`` over ``pos - 2 .. pos + 2`` is at least
    ``--min-phastcons``.

    :return: ``(intron_info, rs_list)`` where every item of ``rs_list`` is
        'pos|left_dist|right_dist|score|phastcons', or ``(None, None)``
        when no site passes
    """
    # Intron sequence
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # Scoring matrix
    matrix3 = load_matrix3()
    # Options
    phastcons_f = pyBigWig.open(options['--bigwig'])
    min_distance = int(options['--min-distance'])
    min_score = float(options['--min-score'])
    min_phastcons = float(options['--min-phastcons'])

    on_plus = strand == '+'
    rs_list = []
    for hit in re.finditer('AGGT', intron_fa):
        offset = hit.start() + 2
        pos = start + offset if on_plus else end - offset

        left_dist, right_dist, dist_flag = cal_distance(
            pos, start, end, min_distance)
        if not dist_flag:  # not enough distance
            continue

        # The scored window is mirrored on the minus strand.
        if on_plus:
            ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
        else:
            ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                 strand='-')
        if 'N' in ss3_seq:  # ensure there is no N
            continue

        ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
        if not score_flag:  # not high score
            continue

        phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
        if phastcons is None or phastcons < min_phastcons:  # not conserved
            continue

        rs_list.append('%d|%d|%d|%f|%f' % (pos, left_dist, right_dist, ss3,
                                           phastcons))

    return (intron_info, rs_list) if rs_list else (None, None)
Ejemplo n.º 38
0
    def _bigwig_extractor(datafile, intervals, out=None, **kwargs):
        """Fill one row of ``out`` per interval with bigWig values.

        When ``out`` is None a float32 array of shape
        ``(len(intervals), width of the first interval)`` is allocated.
        Unless the keyword ``nan_as_zero`` is passed as false, every row is
        handed to ``nan_to_zero`` after it is filled.

        :return: ``out``
        """
        replace_nan = kwargs.get('nan_as_zero', True)
        if out is None:
            first = intervals[0]
            shape = (len(intervals), first.stop - first.start)
            out = np.zeros(shape, dtype=np.float32)

        handle = pyBigWig.open(datafile)
        for row, region in enumerate(intervals):
            out[row] = handle.values(region.chrom, region.start, region.stop)
            if replace_nan:
                nan_to_zero(out[row])
        handle.close()

        return out
Ejemplo n.º 39
0
    def check():
        """Assert that 'sample1.bw' has the expected header fields and float stats."""
        bw = pyBigWig.open('sample1.bw')
        present = list(bw.header().keys())
        expected_fields = ('maxVal', 'minVal', 'nBasesCovered', 'nLevels',
                           'sumData', 'sumSquared', 'version')
        for field in expected_fields:
            assert field in present

        # bigWig version should be independent of BAM input, so we can check
        # the value
        assert bw.header()['version'] == 4

        chrom_names = list(bw.chroms().keys())
        first_stat = bw.stats(chrom_names[0])[0]
        assert isinstance(first_stat, float)
Ejemplo n.º 40
0
def run_file(args, chrom_genes):
    '''
    For genes in a chromosome, get tss/gene body and run fourier transform on the gene

    Opens ``args.in_bigwig``, writes a header line to ``args.out_bed`` and
    calls ``run_gene`` for every row of ``chrom_genes``.  A progress line is
    printed whenever the row label is a multiple of 1000.

    :return: 0
    '''
    bw = pbw.open(args.in_bigwig, 'r')
    gene_count = 0
    try:
        with open(args.out_bed, 'w') as out:
            out.write('name\ttype\tid\tperiodicity\tintensity\n')
            for count, gene in chrom_genes.iterrows():
                run_gene(args, gene, out, bw)
                # Previously never incremented, so progress always showed 0.
                gene_count += 1
                if count % 1000 == 0:
                    # Function-call form works on Python 2 and 3.
                    print('Parsed {gene_count} for {filename} at {chrom}'.format(
                        gene_count=gene_count, filename=out.name, chrom=args.chrom))
    finally:
        bw.close()
    return 0
Ejemplo n.º 41
0
 def __init__(
         self,
         bws='/stor/work/Lambowitz/yaojun/Work/cfNA/tgirt_map/bed_files/merged_bed/coverage/unfragmented.{strand}.bigWig',
         exon_file='/stor/work/Lambowitz/ref/hg19_ref/genes/exons.gencode.bed.gz',
         cutoff=2,
         force=False):
     """Open the per-strand coverage tracks and build the exon file if needed.

     ``bws`` is a path template with a ``{strand}`` placeholder; it is
     filled with 'rvs' for the '-' strand and 'fwd' for the '+' strand.
     ``self.initiate(cutoff=cutoff)`` runs when the high-coverage exon file
     is missing or ``force`` is true.
     """
     self.high_cov_exons = '/stor/scratch/Lambowitz/cdw2854/high_cov_exon.bed'
     self.exon_file = exon_file

     handles = {}
     for strand, strand_label in (('-', 'rvs'), ('+', 'fwd')):
         handles[strand] = pbw.open(bws.format(strand=strand_label))
     self.bws = handles

     missing = not os.path.isfile(self.high_cov_exons)
     if missing or force:
         self.initiate(cutoff=cutoff)
Ejemplo n.º 42
0
 def doWrite(self, bw):
     """Copy ``bw`` into a temporary bigWig and assert the copy is identical.

     Chromosomes "1" and "10" are copied; header, chromosome table and
     intervals of the copy are compared with the input.  ``bw`` is closed
     and the temporary file removed at the end.
     """
     tmp = tempfile.NamedTemporaryFile(delete=False)
     oname = tmp.name
     tmp.close()
     copy = pyBigWig.open(oname, "w")
     assert copy is not None
     #Since this is an unordered dict(), iterating over the items can swap the order!
     chroms = [(name, bw.chroms(name)) for name in ("1", "10")]
     assert len(bw.chroms()) == 2
     copy.addHeader(chroms, maxZooms=1)
     #Copy the input file
     for name, _length in chroms:
         ints = bw.intervals(name)
         copy.addEntries([name for _ in ints],
                         [entry[0] for entry in ints],
                         ends=[entry[1] for entry in ints],
                         values=[entry[2] for entry in ints])
     copy.close()
     #Ensure that the copied file has the same entries and max/min/etc.
     copy = pyBigWig.open(oname)
     assert bw.header() == copy.header()
     assert bw.chroms() == copy.chroms()
     for name, _length in chroms:
         source_ints = bw.intervals(name)
         copied_ints = copy.intervals(name)
         assert source_ints == copied_ints
     bw.close()
     copy.close()
     #Clean up
     os.remove(oname)
Ejemplo n.º 43
0
 def test_bigwig(self):
     """
     Check that encoded ipdRatios in the BigWig output are consistent with
     modified bases in the GFF file (albeit with lower precision).

     Each BigWig value packs two ratios scaled by 100: the minus-strand
     ratio in the low 16 bits and the plus-strand ratio in the bits above.
     """
     import pyBigWig
     f = pyBigWig.open(self.bw_file)
     try:
         # items() exists on Python 2 and 3; iteritems() is Python 2 only.
         for (seqid, start, strand), rec in self.gff_dict.items():
             s = int(f.values(seqid, start - 1, start)[0])
             ipd_minus = (s % 65536) / 100.0
             ipd_plus = (s >> 16) / 100.0
             expected = ipd_plus if strand == "+" else ipd_minus
             self.assertAlmostEqual(rec.IPDRatio, expected, places=1)
     finally:
         f.close()
Ejemplo n.º 44
0
 def test_bigwig(self):
     """
     Check that encoded ipdRatios in the BigWig output are consistent with
     modified bases in the GFF file (albeit with lower precision).

     Each BigWig value packs two ratios scaled by 100: the minus-strand
     ratio in the low 16 bits and the plus-strand ratio in the bits above.
     """
     import pyBigWig
     f = pyBigWig.open(self.bw_file)
     try:
         # items() exists on Python 2 and 3; iteritems() is Python 2 only.
         for (seqid, start, strand), rec in self.gff_dict.items():
             s = int(f.values(seqid, start - 1, start)[0])
             ipd_minus = (s % 65536) / 100.0
             ipd_plus = (s >> 16) / 100.0
             if strand == "+":
                 self.assertAlmostEqual(rec.IPDRatio, ipd_plus, places=1)
             else:
                 self.assertAlmostEqual(rec.IPDRatio, ipd_minus, places=1)
     finally:
         f.close()
Ejemplo n.º 45
0
def test_extract_bigwig_to_numpy(tmpdir):
    """
    Tests extract_bigwig_to_numpy function with padding values.
    Uses pyBigWig to construct sample bigWig to draw values from.
    Compares expected numpy array with computed numpy array.
    """
    bigwig_path = os.path.join(tmpdir, "tmp.bigwig")

    # Fixture: chr1 of length 20 with value 3.0 on [0, 5) and 7.0 on [11, 20).
    writer = pyBigWig.open(bigwig_path, "w")
    writer.addHeader([('chr1', 20)], maxZooms=0)
    writer.addEntries(['chr1', 'chr1'], [0, 11], ends=[5, 20], values=[3.0, 7.0])
    writer.close()

    reader = pyBigWig.open(bigwig_path)
    chrom_sizes = {'chr1': 20}
    padding = 5
    left_interval = ['chr1', 2, 4]  # -3 to 9
    right_interval = ['chr1', 14, 17]  # 9 to 22
    left_output = bigwigio.extract_bigwig_to_numpy(
        left_interval, reader, padding, chrom_sizes)
    right_output = bigwigio.extract_bigwig_to_numpy(
        right_interval, reader, padding, chrom_sizes)

    left_expected = np.array([0, 0, 0, 3.0, 3.0, 3.0, 3.0, 3.0, 0, 0, 0, 0])
    right_expected = np.array(
        [0, 0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 0, 0])
    assert np.allclose(left_expected, left_output)
    assert np.allclose(right_expected, right_output)
Ejemplo n.º 46
0
 def doWrite(self, bw):
     """Write a copy of ``bw`` to a temporary bigWig and check it round-trips.

     Only chromosomes "1" and "10" are copied.  The copy must have the same
     header, chromosome table and intervals as the input.  ``bw`` is closed
     and the temporary file deleted before returning.
     """
     scratch = tempfile.NamedTemporaryFile(delete=False)
     oname = scratch.name
     scratch.close()
     out_bw = pyBigWig.open(oname, "w")
     assert out_bw is not None
     #Since this is an unordered dict(), iterating over the items can swap the order!
     chroms = [("1", bw.chroms("1")), ("10", bw.chroms("10"))]
     assert len(bw.chroms()) == 2
     out_bw.addHeader(chroms, maxZooms=1)
     #Copy the input file
     for chrom_name, _ in chroms:
         records = bw.intervals(chrom_name)
         names = [chrom_name for _record in records]
         starts = [record[0] for record in records]
         ends = [record[1] for record in records]
         values = [record[2] for record in records]
         out_bw.addEntries(names, starts, ends=ends, values=values)
     out_bw.close()
     #Ensure that the copied file has the same entries and max/min/etc.
     out_bw = pyBigWig.open(oname)
     assert bw.header() == out_bw.header()
     assert bw.chroms() == out_bw.chroms()
     for chrom_name, _ in chroms:
         expected = bw.intervals(chrom_name)
         actual = out_bw.intervals(chrom_name)
         assert expected == actual
     bw.close()
     out_bw.close()
     #Clean up
     os.remove(oname)
Ejemplo n.º 47
0
    def get_signal_dict(self, bigwigs):
        """ Get dict of signal[region.tup][bigwig] = signal

        Every region of ``self`` gets one entry per bigWig path, filled with
        the result of ``region.get_signal`` on that file's open handle.
        """
        signal_dict = {}
        for region in self:
            key = region.tup()
            signal_dict[key] = {bigwig: [] for bigwig in bigwigs}

        # Open each file once and query all regions against it.
        for bigwig in bigwigs:
            handle = pyBigWig.open(bigwig, "rb")
            for region in self:
                per_region = signal_dict[region.tup()]
                per_region[bigwig] = region.get_signal(handle)
            handle.close()

        return signal_dict
Ejemplo n.º 48
0
    def __init__(self, infile):
        u"""
        Initialize from a bigWig or bigBed file.

        When ``infile`` exists, the opened handle is stored in
        ``self.bigwig`` or ``self.bigbed`` according to its type and its
        chromosome table in ``self.chroms``.  When it does not exist both
        handles stay None and ``self.chroms`` is not set.

        :raises ValueError: if the file is neither bigWig nor bigBed
        """
        self.bigbed, self.bigwig = None, None
        if not os.path.exists(infile):
            return

        handle = pyBigWig.open(infile)
        if handle.isBigWig():
            self.bigwig = handle
        elif handle.isBigBed():
            self.bigbed = handle
        else:
            raise ValueError('%s is not a legit file' % infile)
        self.chroms = handle.chroms()
Ejemplo n.º 49
0
def GetNormalizationFactorBasedOnRegionList(bigwig_fn, regionlist):
    """
    Sum the per-base signal over the given regions and divide it by 1000.

    regionlist is a list of lists. Each list entries is formatted like this:
    ["chr1", 1, 10].

    The earlier version never unpacked the entries, so the lookup failed
    with a NameError that a bare ``except`` swallowed and None was returned
    for every non-empty list.

    :param bigwig_fn: path of the bigWig file
    :param regionlist: list of ``[chrom, start, stop]`` entries
    :return: total signal / 1E3, or None when the file cannot be opened, a
        region cannot be read or an entry is malformed
    """
    try:
        bw = pyBigWig.open(bigwig_fn)
    except (RuntimeError, OSError):
        return None

    try:
        TotalSignal = 0
        for RegionChr, RegionStart, RegionStop in regionlist:
            # NOTE(review): positions without data may come back as NaN and
            # would make the total NaN — confirm whether a NaN-ignoring sum
            # is wanted.
            TotalSignal += sum(
                bw.values(RegionChr, RegionStart, RegionStop, numpy=True))
        return TotalSignal / 1E3
    except (RuntimeError, ValueError, TypeError):
        # Best effort, as before: unreadable or malformed regions give None.
        return None
    finally:
        bw.close()
Ejemplo n.º 50
0
def main(BIGBED, trackDB, BED, out_path):
    '''
    Create a output dir if needed, generate decoding library, exctract gene name
    , use this to extract Ensembl ID. Write to bed file with gene name and
    Ensembl id.

    The output is ``<out_path>/GTRD_BED/<basename of BIGBED>`` with '.bb'
    replaced by '.bed'; every line holds chrom, start, stop, Ensembl id and
    gene name, tab-separated.
    '''
    # create outdir if needed
    out_dir = os.path.join(out_path, 'GTRD_BED')
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    # Legacy as requires internet connection
    # load mygene tool
    #mg = mygene.MyGeneInfo()

    # load annotation to retreive ensemble ids
    ensemblDecoder = ensemblIDextract(BED)

    # load schema dictionary
    TF_ID = trackDBparser(trackDB)

    # extract gene name using this schema
    gene_name = TF_ID[os.path.basename(BIGBED).split('_')[1]][0]

    # Legacy as requires internet connection
    # use mygene tool to extract ensembl id
    #ensembl_id = mg.query(gene_name, fields = 'ensembl.gene', species = 'fruitfly')['hits'][0]['ensembl']['gene']

    #use reference annotation to retreive ensembl id
    ensembl_id = ensemblDecoder[gene_name]

    out_bed = os.path.join(out_dir,
                           os.path.basename(BIGBED).replace('.bb', '.bed'))

    # open bigbed file
    bb = pbw.open(BIGBED)
    try:
        # intergrate this information with coordinates of binding events, write .bed
        with open(out_bed, 'w') as outfile:
            for chrom, limit in bb.chroms().items():
                # entries() gives None, not an empty list, when a chromosome
                # has no records.
                for bindingEvent in bb.entries(chrom, 0, limit) or []:
                    start = bindingEvent[0]
                    stop = bindingEvent[1]
                    fields = (chrom, start, stop, ensembl_id, gene_name)
                    outfile.write('\t'.join(str(x) for x in fields) + '\n')
    finally:
        bb.close()
Ejemplo n.º 51
0
    def bw_to_dict(self, chrs, window_size=25):
        """
        Bin the bigWig track at ``self.track`` and store per-chromosome bin averages.

        Function taken from evaluation scripts -
        https://github.com/ENCODE-DCC/imputation_challenge/blob/master/build_npy_from_bigwig.py

        Each chromosome is binned into ((chrom_len-1)//window_size)+1
        nonoverlapping bins of size window_size. NaN values are converted
        into zeros before averaging over the bins.

        Because the ends of the bigwig files contain NaNs (regions somehow
        not measured), a naive bin-and-average gives a wrong value for the
        bin that contains the first NaN. Perhaps the simplest solution would
        be to use nanmean and only replace NaNs after averaging, but the
        provided script is followed here: that one bin is recomputed with
        ``bw.stats(..., exact=True)``.

        :param chrs: iterable of chromosome names to read
        :param window_size: bin width in bases
        :return: None; the result for chromosome ``c`` is stored in
            ``self.binned_chroms[c]``
        """
        # NOTE(review): this handle is never closed in this method — confirm
        # whether the caller relies on garbage collection.
        bw = pyBigWig.open(self.track)
        for c in chrs:
            print('Reading chromosome {} from bigwig...'.format(c), flush=True)
            chrom_len = bw.chroms()[c]
            num_step = (
                (chrom_len - 1) // window_size
            ) + 1  # number of bins ensuring all positions are included
            raw = bw.values(
                c, 0, chrom_len,
                numpy=True)  # per-base values of the whole chromosome
            raw.resize(num_step *
                       window_size)  # in-place; typically greater than chrom len
            raw = np.nan_to_num(
                raw
            )  # pyBigWig returns nan for values out of bounds - convert to zero
            raw = np.reshape(raw, (-1, window_size))  # bin it: (num_step, window_size)
            result_per_chr = raw.mean(axis=1)  # average over bins

            # Special treatment for the last bin that still holds measured
            # values (the one where the first nan is): the average above is
            # wrong there because nans were replaced by zeros.
            # bw.intervals(c)[-1] is the last interval in the bigwig, e.g.
            # (248933861, 248934005, 0.08760000020265579)
            # NOTE(review): bw.intervals(c) may be None for a chromosome
            # without intervals, which would fail on the [-1] below — confirm.
            last_interval_end = bw.intervals(c)[-1][
                1]  # find the end location of the last interval. after this we will have nans
            # NOTE(review): if last_interval_end is a multiple of window_size
            # equal to the padded length, last_step would be num_step and
            # index past result_per_chr — verify this cannot happen.
            last_step = last_interval_end // window_size  # where does our last valid window end
            start = last_step * window_size  # where should our first special treatment window start
            end = min((last_step + 1) * window_size, chrom_len)
            stat = bw.stats(c, start, end, exact=True)
            if stat[0] is None:
                result_per_chr[last_step] = 0.0
            else:
                result_per_chr[last_step] = stat[0]

            self.binned_chroms[c] = np.array(result_per_chr)
Ejemplo n.º 52
0
def bdg2bw(bdgFile, bwFile, chromSize):
    """Convert a bedGraph file into a bigWig file.

    ``chromSize`` is a tab-separated file whose first two columns give the
    chromosome name and length for the header.  bedGraph lines that do not
    have exactly four tab-separated fields are reported and skipped.
    """
    with open(chromSize) as sizes:
        cs = [row.strip().split('\t') for row in sizes.readlines()]

    bw = pyBigWig.open(bwFile, "w")
    header = [(str(fields[0]), int(fields[1])) for fields in cs]
    bw.addHeader(header)

    with open(bdgFile, "r") as bdg:
        for line in bdg:
            record = line.strip()
            fields = record.split("\t")
            if len(fields) != 4:
                print("[%s] Warning: skipping bedGraph entry: %s" % (timestamp(), record))
                continue
            chrom, start, end, val = fields
            bw.addEntries(chroms=[chrom], starts=[int(start)], ends=[int(end)], values=[float(val)])

    bw.close()
Ejemplo n.º 53
0
def dhstobw(bamfile, bwfile, library='Duke'):
    # Washington is under processing
    """
    Write the cut counts of a BAM file to a bigWig file.

    :param bamfile: BAM file to read
    :param bwfile: bigWig file to write
    :param library:Duke or Washington

        Duke: |=====>
                        <=====|

        Washington: |===========|

        Out put cutting site '|'
    :return: None
    """
    bamfor = Baminfo.Baminfo(bamfile)

    bw = pyBigWig.open(bwfile, "w")
    bw.addHeader(list(bamfor.chrlen.items()))

    for chromosome, chrom_end in bamfor.chrlen.items():
        dhscut = dhsbam.dhcutcount(bamfile=bamfile, chromosome=chromosome, start=1,
                                   end=chrom_end, library=library)
        if not dhscut:
            continue

        # Positions in ascending order, each with its count as a float.
        starts = sorted(dhscut)
        values = [float(dhscut[position]) for position in starts]

        bw.addEntries(chromosome, starts=starts, values=values,
                      span=1, step=1)

    bw.close()
Ejemplo n.º 54
0
def main():
    """Convert per-chromosome HDF5 datasets into a BigWig file.

    Command line: ``[options] <in_h5_file> <out_bw_file>``.  Every selected
    dataset is written as one fixed-step run (span 1, step 1) starting at
    position 0; the dataset length is used as the chromosome length.

    Options:
      -c  comma-separated chromosome list; each name is prefixed with 'chr'
      -v  print each chromosome as it is written
    """
    # The first argument is the HDF5 input and the second the BigWig output,
    # as the error message below already states.
    usage = 'usage: %prog [options] <in_h5_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='chr', default=None, help='Comma-separated chromosome list')
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide input HDF5 and output BigWig.')
    hdf5_file, bw_file = args

    # open files; the input is only read, so request read-only access
    with h5py.File(hdf5_file, 'r') as h5_in:
        bw_out = pyBigWig.open(bw_file, 'w')
        try:
            # construct header
            if options.chr is not None:
                chroms = ['chr%s'%c for c in options.chr.split(',')]
            else:
                chroms = sorted(h5_in.keys())

            # chromosome and length
            header = [(chrom, len(h5_in[chrom])) for chrom in chroms]

            # write header
            bw_out.addHeader(header)

            for chrom, _length in header:
                if options.verbose:
                    print(chrom)

                # read values
                x = np.array(h5_in[chrom])

                # write the values into the BigWig
                bw_out.addEntries(chrom, 0, values=x, span=1, step=1)
        finally:
            # close the output even when a chromosome fails
            bw_out.close()
Ejemplo n.º 55
0
def _create_bigwig(bed_column, outpath, genome_size_dict):
    # type: (pd.Series, str, Dict[str, int]) -> None
    """Write one bigWig file from a series of bin values.

    After ``reset_index`` the frame must have four columns; it provides the
    ``Chromosome``, ``Bin`` and ``End`` columns and the value as the fourth
    one.  Interval ends are written as ``End + 1``.
    """
    logging.info("Creating biwgwig " + outpath)

    frame = bed_column.reset_index()

    # Header: each chromosome once, in order of first appearance.
    unique_chromosomes = list(frame.Chromosome.drop_duplicates())
    header = [(name, int(genome_size_dict[name])) for name in unique_chromosomes]

    # One entry per row of the frame.
    chromosomes = list(frame.Chromosome)
    starts = _to_int(list(frame.Bin))
    ends = _to_int(list(frame.End + 1))
    values = [float(score) for _chrom, _bin, _end, score in frame.values]

    bw = pyBigWig.open(outpath, "w")
    bw.addHeader(header)
    bw.addEntries(chromosomes, starts, ends=ends, values=values)
    bw.close()
Ejemplo n.º 56
0
    def coverage_from_bigwig(self, bigwig_file, stepsize=100):

        """Return list of arrays describing the coverage of each genomicRegions from <bigwig_file>.

        ``ngslib`` is used when it can be imported, otherwise ``pyBigWig``.

        *Keyword arguments:*

        - bigwig_file -- path to bigwig file
        - stepsize -- used stepsize

        *Output:*

        Class variable <coverage>: a list where the elements correspond to the GenomicRegion. The list elements give
        the number of reads falling into the GenomicRegion.

        """
        # Guard only the import; ``except ImportError:`` is valid on both
        # Python 2 and 3, unlike the comma form.
        try:
            from ngslib import BigWigFile
        except ImportError:
            BigWigFile = None

        self.coverage = []
        if BigWigFile is not None:
            bwf = BigWigFile(bigwig_file)

            for gr in self.genomicRegions:
                depth = bwf.pileup(gr.chrom, max(0, int(gr.initial - stepsize / 2)),
                                   max(1, int(gr.final + stepsize / 2)))
                ds = [depth[d] for d in range(0, gr.final - gr.initial, stepsize)]
                self.coverage.append(np.array(ds))
            bwf.close()
        else:
            import pyBigWig
            bwf = pyBigWig.open(bigwig_file)

            for gr in self.genomicRegions:
                # At least one bin, also for regions shorter than stepsize.
                steps = max(1, int(len(gr) / stepsize))
                ds = bwf.stats(gr.chrom, gr.initial, gr.final, type="mean", nBins=steps)
                # Bins without data are reported as None; count them as 0.
                ds = [x if x else 0 for x in ds]
                self.coverage.append(np.array(ds))
            bwf.close()
Ejemplo n.º 57
0
def main():
    """Convert per-chromosome Zarr arrays into a BigWig file.

    Command line: ``[options] <in_zarr_file> <out_bw_file>``.  Every array
    of the group is written as one fixed-step run (span 1, step 1) starting
    at position 0; the array length is used as the chromosome length.

    Options:
      -v  print each chromosome as it is written
    """
    usage = 'usage: %prog [options] <in_zarr_file> <out_bw_file>'
    parser = OptionParser(usage)
    parser.add_option('-v', dest='verbose', default=False, action='store_true')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        # The input of this tool is a Zarr group, not HDF5.
        parser.error('Must provide input Zarr and output BigWig.')
    zarr_file, bw_file = args

    # open files
    zarr_in = zarr.open_group(zarr_file, 'r')
    bw_out = pyBigWig.open(bw_file, 'w')
    try:
        # construct header: chromosome and length, sorted by name
        header = [(chrom, len(zarr_in[chrom])) for chrom in sorted(zarr_in.keys())]

        # write header
        bw_out.addHeader(header)

        for chrom, _length in header:
            if options.verbose:
                print(chrom)

            # read values
            x = np.array(zarr_in[chrom])

            # write the values into the BigWig
            bw_out.addEntries(chrom, 0, values=x, span=1, step=1)
    finally:
        # close the output even when a chromosome fails
        bw_out.close()
def _region_scores(bigwig_handlers, bigWigFiles, chrom, start, end):
    """Return ``(chrom, scores)`` with one ``stats`` value per bigWig handle.

    If `chrom` is unknown to a file, the name is retried with the 'chr'
    prefix removed or added. The adjusted name is kept for the following
    files and returned to the caller. The process exits when neither
    spelling is present in a file.
    """
    scores = []
    for idx, bwh in enumerate(bigwig_handlers):
        known_chroms = bwh.chroms()
        if chrom not in known_chroms:
            unmod_name = chrom
            if chrom.startswith('chr'):
                # remove the chr part from chromosome name
                chrom = chrom[3:]
            else:
                # prefix with 'chr' the chromosome name
                chrom = 'chr' + chrom
            if chrom not in known_chroms:
                # Same effect as exit(msg) without relying on the builtin
                # that the `site` module injects.
                raise SystemExit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx]))

        score = bwh.stats(chrom, start, end)
        value = score[0] if score else None
        if value is None or np.isnan(value):
            value = np.nan
        scores.append(value)  # mean of fragment coverage for region
    return chrom, scores


def countFragmentsInRegions_worker(chrom, start, end,
                                   bigWigFiles,
                                   stepSize, binLength,
                                   save_data,
                                   bedRegions=None
                                   ):
    """ returns the average score in each bigwig file at each 'stepSize'
    position within the interval start, end for a 'binLength' window.
    Because the idea is to get counts for window positions at
    different positions for sampling the bins are equally spaced
    and *not adjacent*.

    If a list of bedRegions is given, then the number of reads
    that overlaps with each region is counted.

    The return value is a tuple ``(matrix, file_name)``. ``matrix`` has one
    row per processed region and one column per bigWig file. ``file_name``
    is the path of the BED-like file the scores were also written to when
    `save_data` is true, otherwise the empty string.

    All bigWig files and the optional output file are closed before
    returning, also when an error occurs.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Fragment coverage.
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 50, 25, False)[0])
    array([[ 1.,  1.,  2.,  2.],
           [ 1.,  1.,  1.,  3.]])

    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200, [test.bwFile1, test.bwFile2], 200, 200, False)[0])
    array([[ 1.5],
           [ 1.5]])

    BED regions:
    >>> bedRegions = [(test.chrom, 45, 55), (test.chrom, 95, 105), (test.chrom, 145, 155)]
    >>> np.transpose(countFragmentsInRegions_worker(test.chrom, 0, 200,[test.bwFile1, test.bwFile2], 200, 200, False,
    ... bedRegions=bedRegions)[0])
    array([[ 1. ,  1.5,  2. ],
           [ 1. ,  1. ,  2. ]])
    """
    assert start < end, "start {} bigger that end {}".format(start, end)

    # Regions are built with their own variable names so the function
    # parameters are not shadowed.
    if bedRegions:
        regions_to_consider = [(reg_chrom, reg_start, reg_end)
                               for reg_chrom, reg_start, reg_end in bedRegions]
    else:
        regions_to_consider = []
        # `range` works on both Python 2 and Python 3 (`xrange` does not).
        for bin_start in range(start, end, stepSize):
            # the last bin may be smaller than binLength
            bin_end = min(bin_start + binLength, end)
            regions_to_consider.append((chrom, bin_start, bin_end))

    # flat list of scores, one run of len(bigWigFiles) values per region
    sub_score_per_bin = []
    bigwig_handlers = []
    _file = None
    _file_name = ''
    try:
        for bw in bigWigFiles:
            bigwig_handlers.append(pyBigWig.open(bw))

        if save_data:
            _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t')
            _file_name = _file.name

        warnings.simplefilter("default")
        try:
            for reg_chrom, reg_start, reg_end in regions_to_consider:
                reg_chrom, avgReadsArray = _region_scores(
                    bigwig_handlers, bigWigFiles, reg_chrom, reg_start, reg_end)
                sub_score_per_bin.extend(avgReadsArray)
                if _file is not None:
                    _file.write("\t".join(map(str, [reg_chrom, reg_start, reg_end])) + "\t")
                    _file.write("\t".join(["{}".format(x) for x in avgReadsArray]) + "\n")
        finally:
            warnings.resetwarnings()
    finally:
        if _file is not None:
            _file.close()
        for bwh in bigwig_handlers:
            bwh.close()

    # the output is a matrix having as many rows as regions processed
    # and as many columns as bigwig files.
    # np.array([[score1_1, score1_2],
    #           [score2_1, score2_2]]
    rows = len(regions_to_consider)
    return np.array(sub_score_per_bin).reshape(rows, len(bigWigFiles)), _file_name
 def run_after(self, rtc, output_dir):
     """Check the bigWig file written as the task's first output file.

     Asserts that the header reports 6 covered bases and that the
     ``stats`` values for chr1:2-3 and chr2:7-8 are 1.9 and 1.0.
     The file is closed even when one of the assertions fails.
     """
     bw = pyBigWig.open(rtc.task.output_files[0])
     try:
         nrec = bw.header()["nBasesCovered"]
         self.assertEqual(nrec, 6, "{n} != 6".format(n=nrec))
         self.assertAlmostEqual(bw.stats("chr1", 2, 3)[0], 1.9, places=5)
         self.assertAlmostEqual(bw.stats("chr2", 7, 8)[0], 1.0, places=5)
     finally:
         bw.close()
Ejemplo n.º 60
0
    def preload(self, regions, tmpDir=None):
        """
        Given a sample and a set of regions, write a bigWig file containing the underlying signal.

        This function returns the file name, which needs to be deleted by the calling function at some point.

        This sends queries one chromosome at a time, due to memory limits on deepBlue

        `regions` is passed through mergeRegions() and the result is looked
        up by chromosome name. `tmpDir` is the directory in which the
        temporary file is created (None lets tempfile choose).

        Raises RuntimeError when the server reports a status other than
        "okay" or returns an empty query/intersection ID. The bigWig file
        is closed before the error propagates.
        """
        startTime = datetime.datetime.now()
        regions2 = mergeRegions(regions)

        # Make a temporary file; only its name is needed, pyBigWig reopens it
        f = tempfile.NamedTemporaryFile(delete=False, dir=tmpDir)
        fname = f.name
        f.close()

        # Start with the bigWig file
        bw = pyBigWig.open(fname, "w")
        try:
            bw.addHeader(self.chromsTuple, maxZooms=0)  # This won't work in IGV!

            # Make a string out of everything in a reasonable order
            for k, _ in self.chromsTuple:
                # Munge chromosome names as appropriate
                chrom = mungeChromosome(k, regions2.keys())
                if not chrom or chrom not in regions2:
                    continue
                # The request uses name `k`; the regions are looked up
                # under the munged name.
                regionsStr = "\n".join(["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]])
                regionsStr += "\n"

                # Send the regions
                (status, regionsID) = self.server.input_regions(self.genome, regionsStr, self.userKey)
                if status != "okay":
                    # Sample name first, then the server's error text.
                    raise RuntimeError("Received the following error while sending regions for '{}': {}".format(self.sample, regionsID))

                # Get the experiment information
                (status, queryID) = self.server.select_experiments(self.sample, k, None, None, self.userKey)
                if status != "okay":
                    raise RuntimeError("Received the following error while running select_experiments on file '{}': {}".format(self.sample, queryID))
                if not queryID:
                    raise RuntimeError("Somehow, we received None as a query ID (file '{}')".format(self.sample))

                # Intersect
                (status, intersectID) = self.server.intersection(queryID, regionsID, self.userKey)
                if status != "okay":
                    raise RuntimeError("Received the following error while running intersection on file '{}': {}".format(self.sample, intersectID))
                if not intersectID:
                    raise RuntimeError("Somehow, we received None as an intersect ID (file '{}')".format(self.sample))

                # Query the regions
                (status, reqID) = self.server.get_regions(intersectID, "START,END,VALUE", self.userKey)
                if status != "okay":
                    raise RuntimeError("Received the following error while fetching regions in file '{}': {}".format(self.sample, reqID))

                # Wait for the server to process the data
                (status, info) = self.server.info(reqID, self.userKey)
                request_status = info[0]["state"]
                while request_status != "done" and request_status != "failed":
                    time.sleep(0.1)
                    (status, info) = self.server.info(reqID, self.userKey)
                    request_status = info[0]["state"]

                # Get the actual data
                (status, resp) = self.server.get_request_data(reqID, self.userKey)
                if status != "okay":
                    raise RuntimeError("Received the following error while fetching data in file '{}': {}".format(self.sample, resp))

                # Each non-empty line is "start<TAB>end<TAB>value"
                for intervals in resp.split("\n"):
                    interval = intervals.split("\t")
                    if interval[0] == '':
                        continue
                    bw.addEntries([k], [int(interval[0])], ends=[int(interval[1])], values=[float(interval[2])])
        finally:
            # Close the writer even when a server call fails
            bw.close()
        sys.stderr.write("{} done (took {})\n".format(self.sample, datetime.datetime.now() - startTime))
        sys.stderr.flush()

        return fname