def read_dir(self, d):
    """Load interval lookups and score arrays from the directory ``d``.

    If ``d`` does not exist as given, each entry of ``DATA_SEARCH_PATH`` is
    tried as a parent directory and the first existing combination is used.

    Files whose name ends in ``.match`` are read as whitespace-separated
    records ``chrom start end what``; lines starting with ``#`` and lines
    with fewer than four fields are ignored.  Each record is added to a
    per-chromosome ``intervals.Intersecter``.

    Every other file is wrapped in a ``FileBinnedArray`` and stored under the
    part of its name before the first ``.``; when several files share that
    key, only the first one returned by ``os.listdir`` is kept.

    Results are stored on the instance:

    * ``self.lookup`` -- dict of chromosome -> Intersecter, or ``None`` when
      no interval was read.
    * ``self.scores`` -- dict of key -> FileBinnedArray.
    """
    lookup = {}
    scores_by_what = {}

    if not os.path.exists(d):
        for parent in DATA_SEARCH_PATH:
            candidate = os.path.join(parent, d)
            if os.path.exists(candidate):
                d = candidate
                break

    for name in os.listdir(d):
        path = os.path.join(d, name)
        if name.endswith(".match"):
            # The interval file is consumed completely here, so close it
            # deterministically instead of leaking the handle.
            with open(path) as handle:
                for line in handle:
                    if line.startswith("#"):
                        continue
                    fields = line.split()
                    if len(fields) < 4:
                        continue
                    chrom, start, end, what = fields[:4]
                    if chrom not in lookup:
                        lookup[chrom] = intervals.Intersecter()
                    lookup[chrom].add_interval(
                        intervals.Interval(int(start), int(end), what))
        else:
            key = name.split('.')[0]
            if key not in scores_by_what:
                # NOTE(review): the handle is intentionally left open --
                # FileBinnedArray presumably reads from it lazily; confirm.
                scores_by_what[key] = FileBinnedArray(open(path))

    # An empty lookup is reported as None rather than as an empty dict.
    self.lookup = lookup if lookup else None
    self.scores = scores_by_what
# Example #2
# 0
def __main__():
    """Cut AXT blocks read from stdin down to the ranges listed in a file.

    Positional arguments (after option parsing by ``doc_optparse``):

    1. ``range_filename`` -- text file with one ``start end`` pair per line.
    2. ``refindex`` -- integer index of the alignment component whose
       coordinates the ranges refer to.

    ``options.mincols`` (default 10) is a lower bound on ``text_size``: a
    slice is written only when its ``text_size`` is strictly greater.

    For every input block, each overlapping range is clipped to the
    reference component, the block is sliced to it, and the slice is written
    to stdout when every component has ``size >= 1``.
    """

    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        refindex = int(args[1])
        mincols = int(options.mincols) if options.mincols else 10
    except (IndexError, ValueError):
        # Missing or non-numeric arguments; doc_optparse.exit() presumably
        # prints the usage text and terminates -- confirm.
        doc_optparse.exit()

    # Load Intervals

    intersecter = intervals.Intersecter()
    with open(range_filename) as range_file:
        for line in range_file:
            # Skip comments and blank lines, as the sibling MAF filter does.
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            intersecter.add_interval(
                intervals.Interval(int(fields[0]), int(fields[1])))

    # Start axt on stdout

    out = bx.align.axt.Writer(sys.stdout)

    # Iterate over input axt

    for axt in bx.align.axt.Reader(sys.stdin):
        ref_component = axt.components[refindex]
        # Find overlap with reference component
        intersections = intersecter.find(ref_component.start,
                                         ref_component.end)
        # Keep output axt ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = axt.slice_by_component(refindex, start, end)
            # Drop slices in which any component ended up empty.
            good = all(c.size >= 1 for c in sliced.components)
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output axt

    out.close()
def main():
    """Print how many intervals of the second file overlap the first file.

    ``sys.argv[1]`` and ``sys.argv[2]`` are opened with
    ``misc.open_compressed`` and parsed by ``read_intervals`` into
    ``(chrom, start, end)`` triples.  Intervals from the first file are
    indexed per chromosome; each interval from the second file counts once
    if it overlaps at least one indexed interval on the same chromosome.
    """

    intersecters = {}

    # Read ranges

    for chrom, start, end in read_intervals(misc.open_compressed(sys.argv[1])):
        # ``in`` replaces dict.has_key, which no longer exists in Python 3.
        if chrom not in intersecters:
            intersecters[chrom] = intervals.Intersecter()
        intersecters[chrom].add_interval(intervals.Interval(start, end))

    # Count intersection

    total = 0

    for chrom, start, end in read_intervals(misc.open_compressed(sys.argv[2])):
        if chrom in intersecters and intersecters[chrom].find(start, end):
            total += 1

    # Single-argument call form prints identically on Python 2 and 3.
    print(total)
def __main__():
    """Copy MAF blocks from stdin to stdout unless they overlap a range.

    Every positional argument is a text file with one ``start end`` pair per
    line; lines starting with ``#`` and blank lines are skipped.  A block is
    written only when its first component overlaps none of the ranges.
    """

    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    # Explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would silently disable the validation.
    if not args:
        doc_optparse.exit()

    # Load Intervals

    intersector = intervals.Intersecter()

    for filename in args:
        with open(filename) as range_file:
            for line in range_file:
                if line.startswith("#") or line.isspace():
                    continue
                fields = line.split()
                intersector.add_interval(
                    intervals.Interval(int(fields[0]), int(fields[1])))

    # Start MAF on stdout

    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF

    for maf in bx.align.maf.Reader(sys.stdin):
        ref_component = maf.components[0]
        # Find overlap with reference component
        intersections = intersector.find(ref_component.start,
                                         ref_component.end)
        # Write only if no overlap
        if not intersections:
            out.write(maf)

    # Close output MAF

    out.close()
def __main__():
    """Cut MAF blocks read from stdin down to the ranges listed in a file.

    Positional arguments (after option parsing by ``doc_optparse``):

    1. ``range_filename`` -- text file with ``src start end`` per line.
    2. reference component -- an integer component index, or otherwise a
       name that is matched against the part of each component's ``src``
       before the first ``.``.

    Options read: ``mincols`` (default 10; a slice is written only when its
    ``text_size`` is strictly greater) and ``prefix`` (default ``""``),
    which is prepended to every ``src`` read from the range file.

    Blocks without the named reference, or whose reference ``src`` has no
    ranges, are skipped.  Otherwise each overlapping range is clipped to the
    reference component, the block is sliced to it, and the slice is written
    when every component has ``size >= 1``.
    """

    # Parse Command Line

    options, args = doc_optparse.parse(__doc__)

    try:
        range_filename = args[0]
        ref_arg = args[1]
        try:
            refindex = int(ref_arg)
            refname = None
        except ValueError:
            # Not an integer: treat the argument as a reference name.
            refindex = None
            refname = ref_arg
        mincols = int(options.mincols) if options.mincols else 10
        prefix = options.prefix if options.prefix else ""
    except (IndexError, ValueError):
        # Missing or malformed arguments; doc_optparse.exit() presumably
        # prints the usage text and terminates -- confirm.
        doc_optparse.exit()

    # Load Intervals

    intersecters = {}
    with open(range_filename) as range_file:
        for line in range_file:
            # Skip comments and blank lines, as the sibling MAF filter does.
            if line.startswith("#") or line.isspace():
                continue
            fields = line.split()
            src = prefix + fields[0]
            if src not in intersecters:
                intersecters[src] = intervals.Intersecter()
            intersecters[src].add_interval(
                intervals.Interval(int(fields[1]), int(fields[2])))

    # Start MAF on stdout

    out = bx.align.maf.Writer(sys.stdout)

    # Iterate over input MAF

    for maf in bx.align.maf.Reader(sys.stdin):
        # ``is not None`` so an empty name skips blocks instead of falling
        # through to ``components[None]``.
        if refname is not None:
            sourcenames = [comp.src.split('.')[0] for comp in maf.components]
            try:
                refindex = sourcenames.index(refname)
            except ValueError:
                # Reference not present in this block.
                continue

        ref_component = maf.components[refindex]
        # Find overlap with reference component
        if ref_component.src not in intersecters:
            continue
        intersections = intersecters[ref_component.src].find(
            ref_component.start, ref_component.end)
        # Keep output maf ordered
        intersections.sort()
        # Write each intersecting block
        for interval in intersections:
            start = max(interval.start, ref_component.start)
            end = min(interval.end, ref_component.end)
            sliced = maf.slice_by_component(refindex, start, end)
            # Drop slices in which any component ended up empty.
            good = all(c.size >= 1 for c in sliced.components)
            if good and sliced.text_size > mincols:
                out.write(sliced)

    # Close output MAF

    out.close()
def main():
    """Report motif-matrix hits inside annotated regions of a MAF alignment.

    Command line: ``bedfile inmaf spec1,spec2,... motif_file``.

    * ``bedfile`` -- ``chrom start end [name]`` per line; ``#`` lines are
      skipped.  Regions are indexed per chromosome.
    * ``inmaf`` -- MAF file; the chromosome is taken from the part of the
      reference ``src`` after the first ``.``.
    * species list -- comma separated, handed to ``MafBlockScorer``.
    * ``motif_file`` -- read with ``pwmx.Reader``; each matrix id and length
      is echoed to stderr.

    For every matrix and alignment column where at least one species scores
    above 0.5, the ungapped reference coordinates of the hit are computed.
    If they overlap a region, one line is written to stdout:
    ``chrom start end region_label matrix_name scores...``.

    With fewer than four arguments, usage is printed to stderr and the
    process exits with status 0.
    """

    if len(sys.argv) < 5:
        # sys.stderr.write keeps this block valid on both Python 2 and 3.
        sys.stderr.write(
            "%s bedfile inmaf spec1,spec2,... motif_file \n" % sys.argv[0])
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bed_file:
        for line in bed_file:
            if line.startswith('#'):
                continue
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    pwm = {}
    with open(sys.argv[4]) as motif_file:
        for wm in pwmx.Reader(motif_file):
            pwm[wm.id] = wm
            sys.stderr.write("%s %s\n" % (wm.id, len(wm)))

    threshold = 0.5
    species = sys.argv[3].split(',')
    species_indices = range(len(species))

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            reftext = maf.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafBlockScorer(pwm, species, maf):
                mafsrc, mafstart, mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                # Check membership BEFORE indexing: the original looked up
                # regions[mafchrom] first and raised KeyError for any
                # chromosome missing from the bed file.
                if mafchrom not in regions:
                    continue
                chrom_regions = regions[mafchrom]

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    motif_len = len(pwm[mx_name])
                    v_name = mx_name.replace(' ', '_')

                    for offset in range(width):
                        # One hit per column is enough: the reported
                        # coordinates do not depend on which species passed.
                        if not any(mx[i][offset] > threshold
                                   for i in species_indices):
                            continue

                        # Convert the alignment column to an ungapped
                        # reference coordinate by discounting gap characters.
                        refstart = mafstart + offset - reftext.count(
                            '-', 0, offset)
                        refend = refstart + motif_len

                        hits = chrom_regions.find(refstart, refend)
                        if not hits:
                            continue
                        region_label = hits[0].value

                        data = " ".join(
                            "%.2f" % mx[x][offset] for x in species_indices)
                        sys.stdout.write(" ".join(
                            str(v) for v in (mafchrom, refstart, refend,
                                             region_label, v_name, data)
                        ) + "\n")
    sys.exit()

##### INPUTS AND OUTPUTS #####
# get intervals
# Tab-separated rows read as strings; columns 0-2 are used below as
# chromosome, start, end.  ndmin=2 keeps a single-row file two-dimensional
# so that row indexing below still works.
print("Loading data...")
p1_ints = np.loadtxt(options.i, 'str', delimiter="\t", ndmin=2)
print("Calculating rank...")

# save as fast lookup
# One Intersecter per chromosome; each stored value is the original row with
# its row index appended.
bedInts = {}
for j in range(len(p1_ints)):
    chr = p1_ints[j][0]
    start = int(p1_ints[j][1])
    end = int(p1_ints[j][2])
    # ``in`` replaces dict.has_key, which no longer exists in Python 3.
    if chr not in bedInts:
        bedInts[chr] = intervals.Intersecter()
    bedInts[chr].insert(start, end, np.append(p1_ints[j], j))

# pass filter idx
# Integer vector of ones, one entry per input row.
idx = np.ones(len(p1_ints), dtype=int)

##### SCRIPT #####
# loop through files
print("Looping through intervals...")
for i in range(len(p1_ints)):
    # look for overlaps
    atacChr = p1_ints[i][0]
    atacStart = int(p1_ints[i][1])
    atacEnd = int(p1_ints[i][2])

    # if pass filter and only 1 match
def main():
    """Report motif-string hits inside annotated regions of a MAF alignment.

    Command line: ``bedfile inmaf spec1,spec2,... string [string2,...]``.

    * ``bedfile`` -- ``chrom start end [name]`` per line; ``#`` lines are
      skipped.  Regions are indexed per chromosome.
    * ``inmaf`` -- MAF file; the chromosome is taken from the part of the
      reference ``src`` after the first ``.``.
    * species list -- comma separated, handed to ``MafMotifScorer``.
    * remaining arguments -- the motif strings to score.

    Blocks whose reference component overlaps no region are skipped.  For
    every motif and alignment column where at least one species scores above
    0.5, the ungapped reference coordinates of the hit are computed.  If they
    overlap a region, one line is printed:
    ``chrom start end region_label motif scores...``.

    With fewer than four arguments, usage is printed to stderr and the
    process exits with status 0.
    """

    if len(sys.argv) < 5:
        print("%s bedfile inmaf spec1,spec2,... string [string2,...]" %
              sys.argv[0],
              file=sys.stderr)
        sys.exit(0)

    # read in intervals
    regions = {}
    with open(sys.argv[1]) as bed_file:
        for line in bed_file:
            if line.startswith('#'):
                continue
            fields = line.split()
            chrom, start, end = fields[0], int(fields[1]), int(fields[2])
            name = fields[3] if len(fields) > 3 else None
            if chrom not in regions:
                regions[chrom] = intervals.Intersecter()
            regions[chrom].add(start, end, name)

    # A slice of sys.argv is always a list, so no type coercion is needed.
    motif_strings = sys.argv[4:]
    threshold = 0.5
    species = sys.argv[3].split(',')
    species_indices = range(len(species))

    with open(sys.argv[2]) as inmaf:
        for maf in align_maf.Reader(inmaf):
            ref = maf.components[0]
            mafchrom = ref.src.split('.')[1]
            reftext = ref.text

            # Check membership BEFORE indexing: the original looked up
            # regions[mafchrom] first and raised KeyError for any chromosome
            # missing from the bed file.
            if mafchrom not in regions:
                continue
            if not regions[mafchrom].find(ref.start, ref.end):
                continue

            # maf block scores for each matrix
            for scoremax, width, headers in MafMotifScorer(species, maf,
                                                           motif_strings):
                mafsrc, mafstart, mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]
                if mafchrom not in regions:
                    continue
                chrom_regions = regions[mafchrom]

                # lists of scores for each position in scoremax
                for mx_name, mx in scoremax.items():
                    v_name = mx_name.replace(' ', '_')

                    for offset in range(width):
                        # One hit per column is enough: the reported
                        # coordinates do not depend on which species passed.
                        if not any(mx[i][offset] > threshold
                                   for i in species_indices):
                            continue

                        # Convert the alignment column to an ungapped
                        # reference coordinate by discounting gap characters.
                        refstart = mafstart + offset - reftext.count(
                            '-', 0, offset)
                        refend = refstart + len(mx_name)

                        hits = chrom_regions.find(refstart, refend)
                        if not hits:
                            continue
                        region_label = hits[0].value

                        data = " ".join(
                            "%.2f" % mx[x][offset] for x in species_indices)
                        print(mafchrom, refstart, refend, region_label,
                              v_name, data)