Exemple #1
0
def TransUnitIterator(handle, **kwargs):
    """Iterate over the ``//``-terminated records of a TransUnit file.

    :param handle: path of the file to read, or any iterable of text lines
        (for example an already opened file object).
    :param kwargs: accepted for interface compatibility; not used here.
    :returns: generator yielding one ``TransUnit`` per record.
    :raises ValueError: if *handle* is a path that cannot be opened.

    Blank lines and lines starting with ``#`` are skipped.  A record is only
    yielded when its closing ``//`` line is read, so a trailing record that
    lacks the terminator is not returned.
    """
    opened_here = False
    if isinstance(handle, str):
        path = handle
        try:
            handle = open(path, "r")
        except IOError:
            # Translate I/O failures only; a bare ``except`` would also
            # swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Can't open file %s" % path)
        opened_here = True
    try:
        TU = TransUnit()
        for line in handle:
            line = line.strip()
            if not line or line[0] == "#":
                continue
            if line == "//":
                yield TU
                TU = TransUnit()  # start a fresh record
                continue
            x = line.split("\t")

            # The second tab-separated column selects how the line is used.
            tag = x[1]
            if tag == "OverlapGene:":
                TU.append_overlap_gene(Bed(x[2:]))
            elif tag == "OverlapFeat:":
                TU.append_overlap_feat(Bed(x[2:]))
            elif tag == "NearbyPromoter:":
                TU.append_promoter(x[2])
            elif tag == "Promoter Info:":
                TU.promoterInfo = x[2]
            else:
                TU.processHeader(x)
    finally:
        # Close only what this function opened; caller-owned handles are
        # left untouched.
        if opened_here:
            handle.close()
Exemple #2
0
def Main():
    """Add per-sample RPKM columns to both ends of every interaction.

    Reads the command line through ``ParseArg()``.  For each data file the
    total read count is taken from ``pysam.idxstats`` (format ``bam``) or by
    counting the records of the bed file, and the file is indexed with
    ``DBI.init``.  Every non-empty line of ``args.input`` is then written to
    ``args.output`` with the ``RPKM(...)`` value of each sample inserted
    after the first region (columns 0-2) and after the second region
    (columns 7-9).  Interactions touching ``chrM`` are skipped.

    Exits with status 1 when the number of data files and names differ.
    """
    args = ParseArg()

    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        sys.exit(1)  # a usage error must not report success

    # store data information
    data = {}
    total_reads = {}
    for i, temp_name in enumerate(args.name):
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        if args.format[i] == "bam":
            # third column of every idxstats line is summed
            total_reads[temp_name] = sum(
                int(l.rstrip('\n').split('\t')[2])
                for l in pysam.idxstats(args.data[i]))
        else:
            total_reads[temp_name] = 0
            for b in TableIO.parse(args.data[i], "bed"):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, "  reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])

    # One fixed sample order shared by the header and by every row.
    names = list(data.keys())

    with open(args.output, 'w') as output, open(args.input, 'r') as Input:
        # header
        header = ["chr", "start", "end", "type", "name", "subtype", "count"
                  ] + names
        print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2]
                                   for g in header) + "\tinteraction\tp-value"

        num = 0
        print >> sys.stderr, "Start process interactions:"
        for l in Input:
            if l.strip() == '':
                continue
            l = l.strip().split('\t')
            num = num + 1
            if l[0] == "chrM" or l[7] == "chrM":
                continue
            C1 = Bed([l[0], int(l[1]), int(l[2])])
            C2 = Bed([l[7], int(l[8]), int(l[9])])
            rpkm1 = "\t".join(
                str(RPKM(C1, data[n], total_reads[n], n)) for n in names)
            rpkm2 = "\t".join(
                str(RPKM(C2, data[n], total_reads[n], n)) for n in names)
            print >> output, "\t".join(
                str(f)
                for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
            if num % 1000 == 0:
                print >> sys.stderr, "  Output interaction: %d\r" % (num),
Exemple #3
0
def Main():
    """Pair two region files line by line and write one result per pair.

    Command-line options come from ``ParseArg()`` and are published in the
    module-level ``args``; the two values returned by
    ``ReadHistones(args.bg)`` are published as ``sp1`` and ``sp2``.  Pairs
    where either chromosome is neither ``chrX``/``chrY`` nor ``chr<digits>``
    are dropped.  The remaining pairs are mapped through
    ``Generate_output_str`` in a ``Pool`` of ``args.p_num`` workers and every
    truthy result is written to ``args.output``.

    Exits with status 204 when the numbers of histone marks do not match, and
    with the first exception argument (or 1) when the pool run fails.
    """
    global args
    args = ParseArg()

    # Index peak files
    global sp1, sp2
    sp1, sp2 = ReadHistones(args.bg)

    if len(sp1) != len(sp2) or len(sp1) != len(args.histone):
        print >> sys.stderr, "The number of histone marks must be identical for the two species. Provided histone mark names should match the peak files."
        sys.exit(204)

    # Create/truncate the output before the expensive work so an unwritable
    # path fails early; close it right away instead of leaking the handle.
    open(args.output, "w").close()

    def is_wanted(chrom):
        # chrX, chrY, or "chr" followed only by digits
        if chrom == "chrX" or chrom == "chrY":
            return True
        return chrom.lstrip("chr").isdigit()

    # t0 = time()
    input_list = []
    with open(args.q_region[0], "r") as fbed1, open(args.q_region[1],
                                                    "r") as fbed2:
        while True:
            line1 = fbed1.readline().strip().split()
            line2 = fbed2.readline().strip().split()
            # stop at the first empty line of either file
            if not line1 or not line2:
                break

            bed1 = Bed(line1)
            bed2 = Bed(line2)
            if is_wanted(bed1.chr) and is_wanted(bed2.chr):
                input_list.append((bed1, bed2))

    p = Pool(args.p_num)
    try:
        out_queue = p.map(Generate_output_str, input_list)
        p.close()
        p.join()
    except Exception as e:
        p.terminate()
        print >> sys.stderr, e.args
        # e.args may be empty; fall back to a generic failure status
        sys.exit(e.args[0] if e.args else 1)

    # t1 = time()
    # print >>sys.stderr, "Time: " + str((t1 - t0) / 60)

    with open(args.output, "w") as fout:
        for item in out_queue:
            if item:
                print >> fout, item
Exemple #4
0
def Extend_liftPeaks(ori_dict, lift_dict, lift_back_peak):
    '''
    Extend the remapped peak region if it is shorter than 0.9 * original
    length when remapped back.

    ori_dict: original peaks keyed by peak name (names without "$").
    lift_dict: remapped peaks keyed by region name (names containing "$").
    lift_back_peak: path of a file with the peak regions remapped back to
        species 1; the part of column 4 before "$" is the original peak name.

    Returns (sp1_dict, sp2_dict), two OrderedDicts keyed by the remapped-back
    region name: the original peak, and the (possibly extended) remapped
    peak.  Regions whose remapped-back copy does not overlap the original
    peak (strand-aware, via Overlap) are left out.
    '''
    N = 0.9
    sp1_dict = OrderedDict()
    sp2_dict = OrderedDict()

    with open(lift_back_peak, "r") as fin:
        for line in fin:
            line = line.strip().split()
            peak_name = line[3].split("$")[0]
            back_bed = Bed(line)
            ori_bed = ori_dict[peak_name]
            lift_bed = lift_dict[back_bed.id]
            if not Overlap(ori_bed, back_bed, ifStrand=True):
                # if no overlap, no need to proceed.
                continue

            ori_len = (ori_bed.stop - ori_bed.start)
            back_len = (back_bed.stop - back_bed.start)
            sp1_dict[back_bed.id] = ori_bed

            # Convert before dividing: under Python 2, int / int floors, which
            # would turn every ratio below 1 into 0 and defeat the threshold.
            if float(back_len) / ori_len < N:
                left_ext_len = max(0, back_bed.start - ori_bed.start)
                right_ext_len = max(0, ori_bed.stop - back_bed.stop)

                if lift_bed.strand == ori_bed.strand:
                    lift_new_start = max(1, lift_bed.start - left_ext_len)
                    lift_new_stop = lift_bed.stop + right_ext_len + 1
                else:
                    # opposite strands: left and right swap roles
                    lift_new_start = max(1, lift_bed.start - right_ext_len)
                    lift_new_stop = lift_bed.stop + left_ext_len + 1

                sp2_dict[back_bed.id] = Bed([
                    lift_bed.chr, lift_new_start, lift_new_stop,
                    lift_bed.id, lift_bed.score, lift_bed.strand
                ])
            else:
                sp2_dict[back_bed.id] = lift_bed

    return sp1_dict, sp2_dict
Exemple #5
0
    def Annotate(self, dbi1, dbi2, dbi3):  # do RNA annotation if not annotated
        """
        Fill in the annotation fields of this region, at most once.

        Does nothing when ``self.annotated`` is already true.  Otherwise, if
        the chromosome name contains ``"chr"``, ``type``, ``name``,
        ``subtype`` and ``proper`` are set from
        ``annotation(bed, dbi1, dbi2, dbi3)``; in either case ``annotated`` is
        then set to True.

        :param dbi1: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object (from BAM2X) for bed6 file of all kinds of RNA
        :param dbi2: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object for bed12 file of lincRNA and mRNA with intron, exon, UTR
        :param dbi3: the `DBI.init <http://bam2xwiki.appspot.com/DBI>`_ object for bed6 file of mouse repeat

        Example:

        >>> a=annotated_bed("chr13\\t40975747\\t40975770\\t+")
        >>> a.Cluster(3)
        >>> ref_allRNA=DBI.init("../../Data/all_RNAs-rRNA_repeat.txt.gz","bed")
        >>> ref_detail=DBI.init("../../Data/Ensembl_mm9.genebed.gz","bed")
        >>> ref_repeat=DBI.init("../../Data/mouse.repeat.txt.gz","bed")
        >>> a.Annotate(ref_allRNA,ref_detail,ref_repeat)
        >>> print a
        "chr13\\t40975747\\t40975770\\tprotein_coding\\tgcnt2\\tintron\\t3"
        """
        if self.annotated:
            return
        if "chr" in self.chr:
            region = Bed([self.chr, self.start, self.end])
            (self.type, self.name, self.subtype,
             self.proper) = annotation(region, dbi1, dbi2, dbi3)
        # Marked as done even when the name did not qualify for a lookup.
        self.annotated = True
Exemple #6
0
	def read(self,handle):
	    '''
	    Append every item of a list or iterator to this BinIndex.

	    Items that are plain lists or tuples are wrapped in Bed first; any
	    other item is appended unchanged.

	    Example:
	        f=TableIO.parse("filename","vcf")
	        data=binindex()
	        data.read(f)
	    which equals
	        data=binindex(TableIO.parse("filename","vcf"))
	    Example:
	        data.read(bedlist)
	    '''
	    for item in handle:
	        # exact type match: only raw field lists/tuples are converted
	        if type(item) in (list, tuple):
	            item=Bed(item)
	        self.append(item)
Exemple #7
0
def find_nearest(bed, dbi, extends=50000, **query_kwargs):
    """Find the feature closest to *bed* within a flanking window.

    :param bed: query region (needs ``chr``, ``start``, ``stop``, ``strand``).
    :param dbi: object whose ``query(region, **kwargs)`` yields candidates.
    :param extends: number of bases added on each side of *bed* to build the
        search window (the window start is clamped at 0).
    :param query_kwargs: extra keyword arguments forwarded to ``dbi.query``.
    :returns: ``(distance, feature, strand)`` for the candidate with the
        smallest ``distance(bed, feature)`` below ``2 * extends``; on ties the
        first candidate wins.  ``strand`` is ``"."`` if either strand is
        ``"."``, ``"+"`` if both strands are equal, otherwise ``"-"``.
        ``(None, None, None)`` when no candidate qualifies.
    """
    window_start = max(0, bed.start - extends)
    window = Bed([bed.chr, window_start, bed.stop + extends])

    best_dist = 2 * extends
    found = False
    nearest = None
    strand = None

    for hit in dbi.query(window, **query_kwargs):
        dist = distance(bed, hit)  # computed once per candidate
        if dist < best_dist:
            best_dist = dist
            nearest = hit
            if hit.strand == "." or bed.strand == ".":
                strand = "."
            elif hit.strand == bed.strand:
                strand = "+"
            else:
                strand = "-"
            found = True

    if not found:
        return (None, None, None)
    return (best_dist, nearest, strand)
Exemple #8
0
def BamToBedIterator(filename, **kwargs):
    '''
    iterator for reading a bam file, yield Bed Object instead of pysam.AlignedRead Object
    Usage:
        from xplib.TableIO.BamIO import BamToBedIterator
        for read in BamToBedIterator(filename):
            print read
    read is a Bed object built from one alignment:
        (reference name, pos, aend, qname, mapq, strand)
    Alignments with a negative tid are skipped.
    Wrapper In TableIO.parse(filename,"bam2bed")
    Usage:
        for i in TableIO.parse(filename,"bam2bed"):
            print i

    A simple bam2bed.py which will read bam file and print aligned read in bed format:
        import sys
        from xplib import TableIO
        filename=sys.argv[1]
        for i in TableIO.parse(filename,"bam2bed"):
            print i

    **kwargs is accepted for interface compatibility and not used.
    '''

    f = pysam.Samfile(filename, "rb")
    try:
        for i in f:
            if i.tid < 0:
                continue
            strand = "-" if i.is_reverse else "+"
            yield Bed(
                [f.references[i.tid], i.pos, i.aend, i.qname, i.mapq, strand])
    finally:
        # Runs when the generator is exhausted, closed or garbage-collected,
        # so the BAM handle is no longer leaked.
        f.close()
Exemple #9
0
def Main():
  """Pair two region files line by line and write one result per pair.

  Command-line options come from ``ParseArg()`` and are published in the
  module-level ``args``; the two values returned by ``ReadHistones(args.bg)``
  are published as ``sp1`` and ``sp2``.  Pairs where either chromosome is
  neither ``chrX``/``chrY`` nor ``chr`` followed by an integer are dropped.
  The remaining pairs are mapped through ``Generate_output_str`` in a
  ``Pool`` of ``args.p_num`` workers and every truthy result is written to
  ``args.output``.

  Exits with status 1 when the numbers of histone modifications differ.
  """
  global args
  args=ParseArg()

  print >>sys.stderr, "Indexing..."
  global sp1, sp2
  sp1, sp2=ReadHistones(args.bg)
  print >>sys.stderr, "Done indexing"
  if len(sp1)!=len(sp2) or len(sp1)!=len(args.histone):
    print >>sys.stderr, "Check the number of histone modifications!"
    sys.exit(1)  # an error must not report success

  # Create/truncate the output before the expensive work so an unwritable
  # path fails early; close it right away instead of leaking the handle.
  open(args.output,"w").close()

  def is_wanted(chrom):
    # chrX, chrY, or "chr" followed by something int() accepts
    if chrom=="chrX" or chrom=="chrY":
      return True
    try:
      int(chrom.lstrip("chr"))
    except ValueError:
      return False
    return True

  input_list=[]
  with open(args.q_region[0],"r") as fbed1, open(args.q_region[1],"r") as fbed2:
    while True:
      line1=fbed1.readline().strip().split("\t")
      line2=fbed2.readline().strip().split("\t")
      # stop at the first empty line of either file
      if line1[0]=="" or line2[0]=="":
        break

      bed1=Bed(line1)
      bed2=Bed(line2)
      if is_wanted(bed1.chr) and is_wanted(bed2.chr):
        input_list.append((bed1, bed2))

  p=Pool(args.p_num)
  try:
    out_queue=p.map(Generate_output_str,input_list)
    p.close()
    p.join()
  except Exception:
    # do not leave worker processes behind when the map fails
    p.terminate()
    raise

  with open(args.output,"w") as fout:
    for item in out_queue:
      if item:
        print >>fout, item
Exemple #10
0
def test():
    if len(sys.argv) == 1:
        print >> sys.stderr, "Usage: Utils.py file.bed"
        exit()
    a = TableIO.parse(sys.argv[1], 'bed')
    data = readIntoBinIndex(a)
    bed = Bed(["chr1", 100000, 200000, ".", ".", "."])
    g = getOverlapFeatures(bed, data)
    print "Overlap with", bed
    for i in g:
        print i
Exemple #11
0
def Construct_dict(fname):
    """Read a peak file into an OrderedDict keyed by ``Bed.id``.

    :param fname: path of a whitespace-separated file, one region per line;
        each line is split and handed to ``Bed``.
    :returns: ``OrderedDict`` mapping region id to its ``Bed``, in file order.

    Blank lines are skipped.  A repeated id is reported on stderr and the
    program exits with status 1.
    """
    peak_dict = OrderedDict()
    with open(fname, "r") as fin:
        for line in fin:
            fields = line.strip().split()
            if not fields:
                continue  # nothing to build a Bed from
            bed = Bed(fields)
            if bed.id in peak_dict:
                print >> sys.stderr, "Duplicated names: " + bed.id
                sys.exit(1)  # duplicate ids are an error, not a success
            peak_dict[bed.id] = bed
    return peak_dict
Exemple #12
0
def parse_string_to_bed(string):
    """Turn a ``chromosome:start-end`` string into a ``Bed``.

    The start is decremented by one and the end is used unchanged.  When
    the string does not contain exactly one ``:`` or the part after it does
    not contain exactly one ``-``, a format hint is printed to stderr and the
    program exits with status 1.
    """
    pieces = string.split(":")
    if len(pieces) == 2:
        span = pieces[1].split("-")
        if len(span) == 2:
            return Bed([pieces[0], int(span[0]) - 1, int(span[1])])
    # Either split produced the wrong number of parts.
    print >> sys.stderr, "String Format should be\n chromsome:start-end"
    exit(1)
Exemple #13
0
def BedIterator(handle):
    """Yield one ``Bed`` per data line of a tab-separated file.

    :param handle: path of the file to read, or any iterable of text lines
        (for example an already opened file object).
    :returns: generator of ``Bed`` objects built from the tab-split fields.
    :raises ValueError: if *handle* is a path that cannot be opened.

    Blank lines and lines starting with ``#`` are skipped.
    """
    opened_here = False
    if isinstance(handle, str):
        path = handle
        try:
            handle = open(path, "r")
        except IOError:
            # Translate I/O failures only; a bare ``except`` would also
            # swallow KeyboardInterrupt and SystemExit.
            raise ValueError("Can't open file %s" % path)
        opened_here = True
    try:
        for line in handle:
            line = line.strip()
            # Test for emptiness first: indexing an empty string raises
            # IndexError, which used to abort on any blank line.
            if not line or line[0] == "#":
                continue
            yield Bed(line.split("\t"))
    finally:
        # Close only what this function opened.
        if opened_here:
            handle.close()
Exemple #14
0
def Main():
    """Write the genes found near each input region.

    For every line of ``args.input`` the region is re-centred on its midpoint
    and resized to ``args.ext_dis`` on each side, then passed to
    ``findNearbyGene`` together with the annotation index, the original
    start/stop and ``args.target_num``.  Each returned item produces one
    tab-separated output line: region id, item[1], item[0].
    """
    args = ParseArg()

    anno = DBI.init(args.annotation, "bed")
    ext_dis = args.ext_dis
    target_num = args.target_num

    with open(args.input, "r") as fin, open(args.output, "w") as fout:
        for line in fin:
            region = Bed(line.strip().split())
            # remember the original span before the region is resized
            ori_start, ori_stop = region.start, region.stop
            center = (ori_start + ori_stop) / 2
            region.start = center - ext_dis
            region.stop = center + ext_dis

            nearby = findNearbyGene(region, anno, ori_start, ori_stop,
                                    target_num)
            for gene in nearby:
                columns = [region.id, gene[1], str(gene[0])]
                print >> fout, "\t".join(columns)
Exemple #15
0
def test():
    if len(sys.argv) == 1:
        print >> sys.stderr, "Usage: Utils.py file.bed"
        exit()
    a = TableIO.parse(sys.argv[1], 'genebed')
    data = readIntoBinIndex(a)
    bed = Bed(["chr12", 54380000, 54392000, "HOXC", 0, "+"])
    g = getOverlapFeatures(bed, data)
    Overlap_dict = Classify_Overlap(bed, g)
    overlap_string = ''
    for k, v in Overlap_dict.iteritems():
        if v:
            overlap_string += "".join(
                [str(k + '_' + each) + ';' for each in v])
    print bed, overlap_string
Exemple #16
0
def iterOverlapFeature(bed, data):
    '''
    iterate over the features in data that overlap bed
    Usage:
        from xplib.Annotation.Utils import *
        for i in iterOverlapFeature(bed,data):
            print i

    bed:  a region with chr/start/stop, or a plain tuple/list whose first
          three items are taken as (chrom, start, stop).
    data: mapping from chromosome name to a list of bins, each bin being a
          list of features with start/stop.
    Yields every feature f in the bins given by iterRangeOverlapBins with
    f.start < bed.stop and f.stop > bed.start.  Yields nothing when the
    chromosome is not present in data.
    '''
    # The old test compared type(bed) with a list *instance*, which is never
    # equal, so list input was not converted.
    if type(bed) in (tuple, list):
        bed = Bed(bed[0:3])  # guess (chrome,start,stop)
    if bed.chr not in data:
        # plain return ends a generator; raising StopIteration inside one
        # becomes a RuntimeError under PEP 479
        return
    D = data[bed.chr]
    for bin in iterRangeOverlapBins(bed.start, bed.stop):
        for f in D[bin]:
            if f.start < bed.stop and f.stop > bed.start:
                yield f
def Main():
    """Append weighted read counts per sample to each nucleosome line.

    Indexes every bed file in ``args.beds`` under the matching entry of
    ``args.name``, then walks the nucleosome table ``args.nucleosome``
    (skipping chrX, chrY and chrM).  For each nucleosome and each sample the
    entries returned by the index query around the nucleosome centre are
    weighted with ``max(min(1, (ma - |centre distance|) / 25.0), 0)`` and
    summed.  The original line plus one count column per sample is written
    to ``args.output`` under a header built from the first line of the
    nucleosome file.

    NOTE(review): ``ma`` is not defined in this function; it is presumably a
    module-level constant - confirm.
    """
    args = ParseArg()

    #store bed files with indexing and count information:
    bed = {}
    # Sample names in command-line order.  The count columns are filled in
    # this order so they line up with the header; iterating ``bed.keys()``
    # gives an arbitrary order on a plain dict.
    names = []

    print >> sys.stderr, "Starting index bed files:"
    for i in range(len(args.beds)):
        temp_name = args.name[i]
        print >> sys.stderr, "  #Indexing for bed file of", temp_name, "\r",
        bed[temp_name] = DBI.init(args.beds[i], 'bed')
        names.append(temp_name)

    half_len = int(args.len)
    print >> sys.stderr
    print >> sys.stderr, "Reading nucleosome peak xls file from Danpos."
    nucleosomes = TableIO.parse(args.nucleosome, 'metabed', header=True)

    print >> sys.stderr, "Start Counting..."
    count_matrix = []

    with open(args.output, "w") as out:
        with open(args.nucleosome, 'r') as head_in:
            line_head = head_in.readline().strip()
        line_head = line_head + "\t" + "\t".join(str(f) for f in names)
        print >> out, line_head
        for i in nucleosomes:
            chrom = i.chr

            if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
                continue
            center = int(i.start + i.end) / 2
            count = np.zeros(len(args.beds), dtype="float")
            line = str(i)
            for k, name in enumerate(names):
                for j in bed[name].query(
                        Bed([
                            chrom, center - ma - (half_len - 75),
                            center + ma + (half_len - 75)
                        ])):
                    j_center = find_center(j, half_len)
                    weight = max(
                        min(1, (ma - abs(j_center - center)) / 25.0), 0)
                    count[k] += weight
            line = line + "\t" + "\t".join(str(f) for f in count)
            print >> out, line
            count_matrix.append(count)
Exemple #18
0
def compare(isoforms):
    """Report how many reads of a locus are explained by selected isoforms.

    Resets the module-level ``isoforms_set``, ``selected_isoforms_set``,
    ``reads_set`` and ``selected_reads_set``, fetches all reads overlapping
    the span covered by *isoforms* from ``dbi``, and calls
    ``find_max_compatible()`` until it reports that no isoform is left or at
    least 99% of the reads are selected.  Writes REGION,
    ISOFORM_INPUT_NUMBER and SUMMARY lines followed by ``//`` to ``out``.
    """
    global reads_set,isoforms_set,selected_reads_set,selected_isoforms_set
    isoforms_set=[]
    selected_isoforms_set=[]
    chrom=isoforms[0].chr
    isoforms_set.extend(isoforms)
    # overall span of the locus
    min_start=min(iso.start for iso in isoforms_set)
    max_stop=max(iso.stop for iso in isoforms_set)
    transcript_region=Bed([chrom,min_start,max_stop])
    print >>out,"REGION\t",chrom,"\t",min_start,"\t",max_stop
    print >>out,"ISOFORM_INPUT_NUMBER\t",len(isoforms_set)

    # reading all the reads in this transcript region
    reads_set=[]
    selected_reads_set=[]
    reads_set.extend(dbi.query(transcript_region,method="fetch12"))
    reads_num=len(reads_set)

    # compare two sets
    if reads_num==0:
        print >>out,"SUMMARY\t0 / 0 (0.0%) reads were generated from", len(selected_isoforms_set), "isoforms"
        print >>out,"//"
        return

    while float(len(selected_reads_set))/reads_num < 0.99:
        if find_max_compatible():
            # no further isoform can be selected
            break
    print >>out,"SUMMARY\t",len(selected_reads_set) ,"/", reads_num ,
    print >>out,"(%.4f)"%(float(len(selected_reads_set))/reads_num),
    print >>out," reads were generated from ",len(selected_isoforms_set)," isoforms"
    print >>out,"//"
Exemple #19
0
def readIntoBinIndex(handle):
    '''
    Read list  or iterator into BinIndex Data Structure
    Usage:
    Example:
        f=TableIO.parse("filename","genebed")
        data=readIntoBinIndex(f)
    Example:
        data=readIntoBinIndex(bedlist)

    Returns a dict mapping chromosome name to a list of 4681 bins
    (4096 + 512 + 64 + 8 + 1); every item is appended to the bin returned by
    binFromRangeStandard(start, stop).  Plain lists/tuples are converted to
    Bed before they are stored.
    '''
    data = {}
    n_bins = 4096 + 512 + 64 + 8 + 1

    for i in handle:
        a = i
        if type(i) in (list, tuple):
            a = Bed(i)
        if a.chr not in data:
            data[a.chr] = [[] for row in range(n_bins)]
        bin = binFromRangeStandard(a.start, a.stop)
        # Store the converted object, not the raw list: stored items are
        # later read through .start/.stop.
        data[a.chr][bin].append(a)
    return data
Exemple #20
0
def test():
    if len(sys.argv)==1:
        print >>sys.stderr,"Usage: binindex.py file.bed"
        exit()
    a=TableIO.parse(sys.argv[1],'bed')
    data=binindex(a)
    data2=binindex()
    bed=Bed(["chr1",100000,2000000,".",".","."])
    for i in data.query(bed):
        print "before remove:",len(data)
        data.remove(i)
        print "after remove:",len(data)
        data2.append(i)
        print data2
    print "data finalize:"
    data.merge(data2)
    print data
    print data+data2
    print data
    print data.uniq()
    print data
Exemple #21
0
	def query(self,bed,**kwargs):
	    '''
	    iterate over the stored features that overlap bed
	    Usage:
	        data=bedindex(file.bed)
	        for i in data.query(bed):
	            print i

	    bed may also be a plain tuple/list whose first three items are taken
	    as (chrom,start,stop).  Yields every stored feature f with
	    f.start < bed.stop and f.stop > bed.start; yields nothing when the
	    chromosome is not indexed.

	    **kwargs for further extension for annotation with no pre define format
	    '''

	    # The old test compared type(bed) with a list *instance*, which is
	    # never equal, so list input was not converted.
	    if type(bed) in (tuple, list):
	        bed=Bed(bed[0:3])  # guess (chrome,start,stop)
	    if bed.chr not in self.data:
	        # plain return ends a generator; raising StopIteration inside one
	        # becomes a RuntimeError under PEP 479
	        return
	    D=self.data[bed.chr]
	    for bin in binindex.iter_range_overlap_bins(bed.start,bed.stop):
	        for f in D[bin]:
	            if f.start < bed.stop and f.stop > bed.start:
	                yield f
Exemple #22
0
def readIntoBinIndex(handle):
    '''
    Build a BinIndex dictionary from a list or iterator of features.

    Plain lists and tuples are wrapped in Bed; every other item is used
    as it is.  Each feature is added with appendIntoBinIndex.

    Example:
        f=TableIO.parse("filename","genebed")
        data=readIntoBinIndex(f)
    Example:
        f=TableIO.parse("filename","vcf")
        data=readIntoBinIndex(f)
    Example:
        data=readIntoBinIndex(bedlist)
    '''
    index = {}
    raw_types = (type([]), type((1, 2, 3)))

    for item in handle:
        if type(item) in raw_types:
            appendIntoBinIndex(index, Bed(item))
        else:
            appendIntoBinIndex(index, item)
    return index
def GetAnnotationName(pAnno, hasAnno, dbi, hasRepeat, dbirepeat):
    """Set ``pAnno.id`` and return ``"<type>:<id>:<proper>"``.

    :param pAnno: annotation record; its ``source`` selects the rule.
    :param hasAnno: whether *dbi* should be searched.
    :param dbi: index queried for features whose id looks like
        ``<x>.<id>.<name>``.
    :param hasRepeat: whether *dbirepeat* should be searched.
    :param dbirepeat: index queried for features whose id looks like
        ``<name>&...``.
    :raises Exception: for a genome record that is not a tRNA and was not
        found although at least one index was searched.

    Rules:
      * ``source == "genome"``: the first matching hit in *dbi*, then in
        *dbirepeat*, decides the id.  A tRNA without a hit is reported by
        name (``pAnno.id`` is not set).  Without any index the id is
        type + chr + name + start.
      * ``".fa"`` in source: the id is the part of ``chr`` before the first
        ``_`` and ``chr`` becomes the remainder, if there is one.
      * otherwise the id is the name.
    """
    if pAnno.source == "genome":
        if hasAnno or hasRepeat:
            # Both lookups use the same query region.  It used to be created
            # only under hasAnno, so a repeat-only lookup hit an unbound name.
            bed = Bed([pAnno.chr, pAnno.start, pAnno.end, '.', 0.0, '+'])

        if hasAnno:
            for hit in dbi.query(bed):
                parts = hit.id.split(".", 2)
                name_component = parts[2]
                id_component = parts[1]
                if pAnno.name == name_component or pAnno.name.split(
                        "_", 2)[0] == id_component:
                    pAnno.id = id_component
                    return ":".join(
                        str(f) for f in [pAnno.type, pAnno.id, pAnno.proper])

        if hasRepeat:
            for hit in dbirepeat.query(bed):
                tempname = hit.id.split("&")
                if pAnno.name == tempname[0]:
                    pAnno.id = "".join(
                        str(f) for f in [pAnno.chr, pAnno.name, hit.start])
                    return ":".join(
                        str(f) for f in [pAnno.type, pAnno.id, pAnno.proper])
        if pAnno.type == 'tRNA':
            return ":".join(
                str(f) for f in [pAnno.type, pAnno.name, pAnno.proper])
        if hasAnno or hasRepeat:
            raise Exception('pAnno not found! ' + pAnno.chr + ':' + pAnno.name)
        pAnno.id = "".join(
            str(f) for f in [pAnno.type, pAnno.chr, pAnno.name, pAnno.start])
    elif ".fa" in pAnno.source:
        head, sep, tail = pAnno.chr.partition("_")
        pAnno.id = head
        if sep:
            # only rewrite chr when there really was an "_" to split on
            pAnno.chr = tail
    else:
        pAnno.id = pAnno.name
    return pAnno.type + ":" + pAnno.id + ":" + pAnno.proper
Exemple #24
0
def genome_annotation(outputbam,
                      annotationfile,
                      detail,
                      annotationRepeat,
                      mapq_thred,
                      strandenforced=False,
                      posstrand=True,
                      requireUnique=False,
                      results_dict=None):
    """Collect one annotation line per accepted alignment, keyed by read name.

    :param outputbam: iterable of alignment records that also provides
        ``getrname(tid)``.
    :param annotationfile: bed file for ``annotation``; when falsy no
        annotation is done and a shorter line is produced.
    :param detail: bed file with the detailed annotation (db_detail).
    :param annotationRepeat: bed file with the repeat annotation.
    :param mapq_thred: threshold handed to ``Included``.
    :param strandenforced: keep only alignments annotated ``ProperStrand``.
    :param posstrand: when False the strand used for annotation is flipped.
    :param requireUnique: forwarded to ``Included`` as its second argument.
    :param results_dict: earlier results to merge with; entries for the same
        read name are replaced by the new ones.  Not modified.
    :returns: new dict ``read name -> line``.  The value is a plain string
        for a read accepted by ``Included(record, True, mapq_thred)`` and
        seen once, otherwise a list of strings.
    """
    # annotationfile is annotation file
    # detail is db_detail file
    if results_dict is None:
        # avoid sharing one default dict between calls
        results_dict = {}

    if annotationfile:
        dbi1 = DBI.init(annotationfile, "bed")
        dbi2 = DBI.init(detail, "bed")
        dbi3 = DBI.init(annotationRepeat, "bed")

    newdict = dict()

    for record in outputbam:
        cigar = record.cigarstring
        # A record without CIGAR has cigarstring None; testing "N" against it
        # raised TypeError before Included() had a chance to reject the read.
        if cigar is None or "N" not in cigar:
            anno_start = record.pos
            anno_end = record.aend
            bed_start = record.pos
            bed_end = record.aend
        else:
            bed_list, anno_start, anno_end = Exon_junction(record)
            bed_start = ",".join([str(f[0]) for f in bed_list])
            bed_end = ",".join([str(f[1]) for f in bed_list])

        if not Included(record, requireUnique, mapq_thred):
            continue

        strand = "-" if record.is_reverse else "+"
        if posstrand:
            strandactual = strand
        else:
            strandactual = "+" if record.is_reverse else "-"
        chrom = outputbam.getrname(record.tid)

        if annotationfile:
            bed = Bed([chrom, anno_start, anno_end, '.', 0.0, strandactual])
            [typ, name, subtype,
             strandcol] = annotation(bed, dbi1, dbi2, dbi3)
            if strandenforced and strandcol != 'ProperStrand':
                continue
            fields = [
                chrom, bed_start, bed_end, strand, record.seq, 'genome', typ,
                name, subtype, strandcol
            ]
        else:
            fields = [
                chrom, record.aend - record.alen + 1, record.aend, strand,
                record.seq, 'genome', '.'
            ]
        entry = '\t'.join(str(f) for f in fields)

        if record.qname not in newdict:
            if Included(record, True, mapq_thred):
                newdict[record.qname] = entry
            else:
                # not unique
                newdict[record.qname] = [entry]
        else:
            if type(newdict[record.qname]) is str:
                newdict[record.qname] = [newdict[record.qname]]
            newdict[record.qname].append(entry)

    newanno = dict(results_dict)
    newanno.update(newdict)
    return newanno
Exemple #25
0
def compare_reads(isoforms):
    """Estimate isoform proportions from compatible reads and print them.

    All reads overlapping the span covered by *isoforms* are fetched from the
    module-level ``dbi``.  Each read is reduced to a bit code (one bit per
    isoform, set when ``Tools.compatible_with_transcript`` is true) and the
    codes are counted.  Starting from equal proportions, the proportions are
    re-estimated iteratively until two successive estimates are closer than
    1e-05 according to ``distance`` (with ``args.BYY`` this phase stops after
    more than 10 rounds at the latest).  With ``args.BYY`` a hard-cut phase
    follows in which every code is credited entirely to its compatible
    isoform with the largest current proportion.

    One ``HT`` line is written to ``out`` for every isoform whose proportion
    exceeds ``args.threshold`` and, unless ``args.hits_only``, one ``NT``
    line for every other isoform; an isoform with score 0.0 gets the
    proportion as its score when its line is printed.  The record ends with
    ``//``.
    """
    # global dbi,out
    isoforms_set=[]
    chrom=isoforms[0].chr
    min_start=isoforms[0].start
    max_stop=isoforms[0].stop
    for i in isoforms:
        if i.start < min_start: min_start=i.start
        if i.stop > max_stop: max_stop=i.stop
        isoforms_set.append(i)
    transcript_region=Bed([chrom,min_start,max_stop])
    print >>out,"REGION\t",chrom,"\t",min_start,"\t",max_stop
    print >>out,"ISOFORM_INPUT_NUMBER\t",len(isoforms_set)

    # reading all the reads in this transcript region
    reads_set=list(dbi.query(transcript_region,method="fetch12"))
    reads_num=len(reads_set)

    # compare two sets: count reads per compatibility bit code
    l=len(isoforms_set)
    bincodes={}
    total=reads_num
    if total==0: total=0.001  # avoid dividing by zero below
    for read in reads_set:
        bincode=0
        for iso in isoforms_set:
            if Tools.compatible_with_transcript(read,iso):
                bincode = (bincode<<1)+1
            else:
                bincode = bincode<<1
        bincodes[bincode]=bincodes.get(bincode,0)+1

    # EM initialize: equal proportions
    proportion=[ 1.0/l for i in range(l) ]

    iterate_time=0
    while True:
        # E step: spread every code over its compatible isoforms
        totals=[0.0 for i in range(l)]
        for code in bincodes.keys():
            compatible=[j for j in range(l) if get_bit_n(j,l,code)]
            row_total=0.0
            for j in compatible:
                row_total+=proportion[j]
            for j in compatible:
                totals[j]+=bincodes[code] * proportion[j] / row_total

        # M step.  A fresh list is built every round: updating one list in
        # place after ``proportion = new_proportion`` made both names refer
        # to the same object, so the distance was always 0 from round two on.
        new_proportion=[totals[i]/total for i in range(l)]
        dis=distance(proportion,new_proportion)
        proportion=new_proportion
        iterate_time+=1
        if dis<1e-05: break
        if args.BYY and iterate_time > 10: break

    # BYY Hard Cut Algorithm
    max_byy_rounds=1000  # safety net in case the hard cut keeps flipping
    byy_rounds=0
    while args.BYY:
        totals=[0.0 for i in range(l)]
        for code in bincodes.keys():
            maxj=-1
            for j in range(l):
                if get_bit_n(j,l,code):
                    if maxj==-1 or proportion[j] > proportion[maxj]:
                        maxj=j
            if maxj!=-1: totals[maxj]+=bincodes[code]
        new_proportion=[totals[i]/total for i in range(l)]
        dis=distance(proportion,new_proportion)
        proportion=new_proportion
        byy_rounds+=1
        if dis<1e-05 or byy_rounds>=max_byy_rounds: break

    # print isoforms
    for i,x in enumerate(isoforms_set):
        if proportion[i] > args.threshold:
            label="HT"
        elif not args.hits_only:
            label="NT"
        else:
            continue
        if x.score==0.0:
            x.score=proportion[i]
        print >>out,label+"\t",x,"\t",proportion[i]
    print >>out,"//"
Exemple #26
0
def randRegion(length, RNAs):
    '''  pick random coordinates of the desired length inside a random RNA

    The RNA type comes from selectType(length); one RNA of that type is
    chosen from RNAs[Type].  The start is drawn between the RNA start and
    stop - length (or just the RNA start when the RNA is shorter than
    length).  Returns (Bed on the RNA strand, Type).
    '''
    Type = selectType(length)
    RNA = random.choice(RNAs[Type])
    last_start = RNA.stop - length
    if last_start < RNA.start:
        # RNA shorter than the requested length: only its start is allowed
        last_start = RNA.start
    start = random.randrange(RNA.start, last_start + 1)
    region = Bed([RNA.chr, start, start + length], strand=RNA.strand)
    return region, Type
Exemple #27
0
def genome_annotation(outputbam, annotationfile, detail, readfilename, unmapfilename, strandenforced = False, posstrand = True, requireUnique = False, results_dict = None, annotationRepeat = "/home/yu68/bharat-interaction/new_lincRNA_data/mouse.repeat.txt"):
    """Annotate accepted alignments and dump the rest as FASTA.

    :param outputbam: iterable of alignment records that also provides
        ``getrname(tid)``.
    :param annotationfile: bed file for ``annotation``; when falsy no
        annotation is done and a shorter line is produced.
    :param detail: bed file with the detailed annotation (db_detail).
    :param readfilename: not used by this function.
    :param unmapfilename: path of the FASTA file receiving every record that
        produced no annotation line (reverse reads are reverse-complemented).
    :param strandenforced: keep only alignments annotated ``ProperStrand``.
    :param posstrand: when False the strand used for annotation is flipped.
    :param requireUnique: forwarded to ``Included`` as its second argument.
    :param results_dict: earlier results to merge with; entries for the same
        read name are replaced by the new ones.  Not modified.
    :param annotationRepeat: bed file with the repeat annotation; defaults to
        the path that used to be hard-coded here.
    :returns: new dict ``read name -> line``.  The value is a plain string
        for a read accepted by ``Included(record, True)`` and seen once,
        otherwise a list of strings.
    """
    # annotationfile is annotation file
    # detail is db_detail file
    if results_dict is None:
        # avoid sharing one default dict between calls
        results_dict = {}

    if annotationfile:
        dbi1=DBI.init(annotationfile,"bed")
        dbi2=DBI.init(detail,"bed")
        dbi3=DBI.init(annotationRepeat,"bed")

    newdict = dict()

    # ``with`` closes the FASTA file even when a record raises.
    with open(unmapfilename, 'w') as funmap:
        for record in outputbam:
            entry = None  # stays None when the record yields no line

            if Included(record, requireUnique):
                strand = "-" if record.is_reverse else "+"
                if posstrand:
                    strandactual = strand
                else:
                    strandactual = "+" if record.is_reverse else "-"
                chrom = outputbam.getrname(record.tid)
                if annotationfile:
                    bed=Bed([chrom, record.pos, record.aend,'.',0.0,strandactual])
                    [typ, name, subtype, strandcol] = annotation(bed,dbi1,dbi2,dbi3)
                    if (not strandenforced) or strandcol == 'ProperStrand':
                        entry = '\t'.join(str(f) for f in [chrom, record.pos, record.aend, strand, record.seq, 'genome', typ, name, subtype, strandcol])
                else:
                    entry = '\t'.join(str(f) for f in [chrom, record.aend - record.alen + 1, record.aend, strand, record.seq, 'genome', '.'])

            if entry is None:
                # output all pairs that cannot be mapped on both sides as unmaped pairs into two fasta file
                seq = record.seq
                if record.is_reverse:
                    seq = revcomp(record.seq, rev_table)
                unmap_rec = SeqRecord(Seq(seq, IUPAC.unambiguous_dna), id = record.qname, description='')
                SeqIO.write(unmap_rec, funmap, "fasta")
                continue

            if record.qname not in newdict:
                if Included(record, True):
                    newdict[record.qname] = entry
                else:
                    # not unique
                    newdict[record.qname] = [entry]
            else:
                if type(newdict[record.qname]) is str:
                    newdict[record.qname] = [newdict[record.qname]]
                newdict[record.qname].append(entry)

    newanno = dict(results_dict)
    newanno.update(newdict)
    return newanno
Exemple #28
0
def _write_splice_sites(site_counts,label,genome,out):
    '''
    Write one sorted Bed line per splice site in site_counts to out.

    site_counts maps a tab-separated site ID (as built by bedToID) to the
    number of introns that use the site. Fields 0-2 of the ID become the
    first three Bed fields and field 3 the sixth (presumably chrom, start,
    end and strand -- confirm against bedToID); the count is the fifth.
    label is "donor" or "acceptor" and is used to name the sites.
    genome is a DBI genome handle, or None when no genome was given; the
    upper-cased genome.query() result is appended only when it is available.
    '''
    sites=[]
    for key,count in site_counts.items():
        a=key.split("\t")
        sites.append(Bed([a[0],a[1],a[2],"noname_"+label,count,a[3]]))
    sites.sort()
    for i,x in enumerate(sites):
        # IDs are assigned after sorting so that they follow the sorted order.
        x.id=label+"_"+str(i)
        if genome is None:
            # No genome was supplied: report the site without its sequence
            # instead of failing on None.query().
            print >>out,x
        else:
            print >>out,x,"\t",genome.query(x).upper()


def Main():
    '''
    Count the splice donor and acceptor sites used by the input reads.

    Every record parsed from args.input (format "bam2bed12") is split into
    its introns; introns shorter than args.intron_min_length are ignored.
    The head of each remaining intron is counted as a donor site and its
    tail as an acceptor site. After a provenance header, the donor sites
    and then the acceptor sites are written to args.output as sorted Bed
    lines, each followed by its genome sequence when args.genome is given.
    '''
    # IO TEMPLATE
    global args,out
    args=ParseArg()
    fin=IO.fopen(args.input,"r")
    out=IO.fopen(args.output,"w")
    # END OF IO TEMPLATE
    print >>out,"# This data was generated by program ",sys.argv[0]," (version: %s)"%VERSION,
    print >>out,"in bam2x ( https://github.com/nimezhu/bam2x )"
    print >>out,"# Date: ",time.asctime()
    print >>out,"# The command line is :"
    print >>out,"#\t"," ".join(sys.argv)

    donorSites={}
    acceptorSites={}
    if args.genome is not None:
        genome=DBI.init(args.genome,"genome")
    else:
        genome=None

    for j,i in enumerate(TableIO.parse(fin,"bam2bed12",references=fin.references,strand=args.strand)):
        if j%1000==0: print >>sys.stderr,"processing ",j,"reads               \r",
        for intron in i.Introns():
            if len(intron)< args.intron_min_length: continue
            donorID=bedToID(intron.head())
            donorSites[donorID]=donorSites.get(donorID,0)+1
            acceptorID=bedToID(intron.tail())
            acceptorSites[acceptorID]=acceptorSites.get(acceptorID,0)+1

    _write_splice_sites(donorSites,"donor",genome,out)
    _write_splice_sites(acceptorSites,"acceptor",genome,out)
Exemple #29
0
def Main():
    '''
    Count reads around each Danpos nucleosome peak in every input data set.

    Each file in args.bams is indexed with DBI.init under the matching
    entry of args.name. Nucleosomes parsed from args.nucleosome are skipped
    when smt_pval or fuzziness_pval exceeds 0.01, or when they lie on chrX,
    chrY or chrM. For every remaining nucleosome the reads in a window
    around its center are fetched from each data set, and getCount() turns
    their centers into an offset and a count.

    args.output receives the header line of the nucleosome file extended
    with one count column and one "<name>_off" column per data set, then
    one line per kept nucleosome. With args.verbose, the shifted
    nucleosome coordinates of each data set are also written to
    "<name>_shift_nucleosomes.bed".
    '''
    args=ParseArg()

    # Columns are matched to data sets by position, so the two lists must
    # have the same length.
    if len(args.bams)!=len(args.name):
        print >>sys.stderr,"ERROR: Number of data is not the same as number of names!"
        sys.exit(1)

    #store bed files with indexing and count information:
    bam={}

    print >>sys.stderr,"Starting index bed files:"
    for temp_name,path in zip(args.name,args.bams):
        print >>sys.stderr,"  #Indexing for bed file of",temp_name,"\r",
        bam[temp_name]=DBI.init(path,args.fmt)

    print >>sys.stderr
    print >>sys.stderr,"Reading nucleosome peak xls file from Danpos."
    nucleosomes=TableIO.parse(args.nucleosome,'metabed',header=True)

    print >>sys.stderr,"Initial output files..."

    # Read the header line through a context manager so the handle is closed.
    with open(args.nucleosome,'r') as header_file:
        line_head=header_file.readline().strip()
    line_head=line_head+"\t"+"\t".join(str(f) for f in args.name)+'\t'+"\t".join(str(f)+'_off' for f in args.name)

    out=open(args.output,"w")
    out_mark=[]
    try:
        # -- for verbose ---
        if args.verbose:
            for n in args.name:
                out_mark.append(open(n+'_shift_nucleosomes.bed','w'))
        # ------------------
        print >>out,line_head

        print >>sys.stderr,"Start Counting..."
        num=0
        t0 = time()
        for i in nucleosomes:
            chrom=i.chr
            if i.smt_pval>0.01 or i.fuzziness_pval>0.01: continue # only choose nucleosomes with high value and low fuzziness
            if chrom == 'chrY' or chrom == 'chrX' or chrom == 'chrM':
                continue
            num=num+1
            center=int(i.start+i.end)//2
            count=np.zeros(len(args.bams),dtype="float")
            offset=np.zeros(len(args.bams),dtype='int')
            line=str(i)
            # The query window is the same for every data set.
            region_start=center-ma-(half_len-75)-rangeS
            region_stop=center+ma+(half_len-75)+rangeS
            # Iterate args.name rather than bam.keys(): dict key order is
            # arbitrary, while the header columns and out_mark follow the
            # order of args.name.
            for k,name in enumerate(args.name):
                region=Bed([chrom,region_start,region_stop])
                if args.fmt=='bam':
                    query=bam[name].query(region,method='fetch')
                else:
                    query=bam[name].query(region)
                read_centers=[find_center(j,args.fmt) for j in query]
                [o,c]=getCount(read_centers,center)
                count[k]=c
                offset[k]=o
                # -- for verbose ---
                if args.verbose:
                    print >>out_mark[k],chrom+'\t%d\t%d'%(i.start+o,i.end+o)
                # ------------------
            line = line + "\t" + "\t".join(str(f) for f in count) + '\t' + "\t".join(str(f) for f in offset)
            if num%20000==0:
                t1 = time()
                print >>sys.stderr,"processing %dth nucleosome..., time: %.2fs."%(num,t1-t0),'\r',
                t0 = time()
            print >>out,line
        # Terminate the '\r' progress line, which is written to stderr.
        print >>sys.stderr
    finally:
        # Close every output file even when counting fails part-way.
        out.close()
        for k in out_mark:
            k.close()