Beispiel #1
0
    def build_fwtrack(self, fhd):
        """Build an FWTrackI object from every line of ``fhd``.

        Each line is handed to ``self.__fw_parse_line``, which is
        expected to return a ``(chromosome, position, strand)`` tuple.
        A line is skipped when the returned position is ``None`` or the
        chromosome is empty/``None``; a position of 0 is kept.

        A progress message is logged every 1,000,000 lines read
        (skipped lines count as well).

        fhd : an iterable of lines, e.g. an open file handler

        Returns the populated FWTrackI object. No merging or sorting is
        done here.
        """
        fwtrack = FWTrackI()
        for line_count, thisline in enumerate(fhd, 1):
            (chromosome, fpos, strand) = self.__fw_parse_line(thisline)
            if line_count % 1000000 == 0:
                logging.info(" %d" % line_count)
            # Identity test on purpose: 0 is a legal position and must
            # not be dropped together with the "no position" marker.
            if fpos is None or not chromosome:
                continue
            fwtrack.add_loc(chromosome, fpos, strand)
        return fwtrack
Beispiel #2
0
    def build_fwtrack(self, lfhd, rfhd, dist=200):
        """Build an FWTrackI object from paired left/right tag files.

        lfhd: the filehandler (iterator of lines) for the left tag file
        rfhd: the filehandler (iterator of lines) for the right tag file
        dist: the best distance between two tags in a pair; stored on
              the instance as ``self.dist`` before parsing starts

        The two inputs are read in lockstep, one line from each per
        step, and every pair of lines is passed to
        ``self.__fw_parse_line(lline, rline)``, which is expected to
        return ``(chromosome, position, strand)``. Reading stops as soon
        as either input is exhausted.

        A pair is skipped when the returned position or chromosome is
        falsy; note that this also skips a position of 0.

        The scoring of candidate pairings is done by the line parser,
        not here. According to the original notes it is:

        score = abs(abs(rtag-ltag)-200)+error4lefttag+error4righttag

        the smaller the score the better the pairing; pairings scoring
        above 200 are discarded, and only a single best pairing is
        kept. (NOTE(review): presumably the parser uses ``self.dist`` in
        place of the literal 200 -- confirm against the parser.)

        Note, the orders in left tag file and right tag file must
        match, i.e., the Nth left tag must have the same name as the
        Nth right tag.

        Note, remove comment lines beforehand.

        A progress message is logged every 1,000,000 pairs read.
        Returns the populated FWTrackI object.
        """
        fwtrack = FWTrackI()
        self.dist = dist
        pair_count = 0
        while True:
            # Builtin next() works for both Python 2 and Python 3
            # iterators, unlike the Python-2-only ``.next`` method.
            # Only the reads are guarded, so a StopIteration leaking
            # out of parsing cannot silently truncate the track.
            try:
                lline = next(lfhd)
                rline = next(rfhd)
            except StopIteration:
                break
            (chromosome, fpos, strand) = self.__fw_parse_line(lline, rline)
            pair_count += 1
            if pair_count % 1000000 == 0:
                logging.info(" %d" % pair_count)
            if not fpos or not chromosome:
                continue
            fwtrack.add_loc(chromosome, fpos, strand)
        return fwtrack
Beispiel #3
0
    def build_fwtrack(self, fhd):
        """Collect tag locations from ``fhd`` into a new FWTrackI object.

        Every line is parsed with ``self.__fw_parse_line``, which yields
        a ``(chromosome, position, strand)`` triple. Only triples whose
        position and chromosome are both truthy are stored, so a
        position of 0 is skipped as well as a missing one.

        Per the original note, only the unique match for a tag is kept;
        that filtering is not visible in this method and is presumably
        done by the line parser.

        Progress is logged once per 1,000,000 lines consumed. Returns
        the FWTrackI object.
        """
        track = FWTrackI()
        for seen, raw_line in enumerate(fhd, 1):
            chrom, position, strand = self.__fw_parse_line(raw_line)
            if seen % 1000000 == 0:
                logging.info(" %d" % seen)
            if position and chrom:
                track.add_loc(chrom, position, strand)
        return track
Beispiel #4
0
def read_motif2(motif_fhd, species, cutoff=0):
    """Read a binary motif scan result and return an FWTrackI object
    containing the motif locations.

    * The whole file is loaded into memory, so use this function only
      when the motif scan data file is not big.

    motif_fhd : a file handler for the binary motif scan result; it is
                rewound to offset 0 and read completely
    species   : must be "mm8" for mouse or "hg18" for human
    cutoff    : cutoff for the motif scan score; only hits whose
                absolute score is strictly greater are kept

    File layout as read by this function:

    * header: one little-endian 4-byte integer per chromosome, spaced
      128 bytes apart, in the order chr1..chrN, chrX, chrY (N is 22 for
      hg18 and 19 for mm8). Each value is the byte offset where that
      chromosome's hits start.
    * hits: consecutive 8-byte records, a little-endian 4-byte integer
      location followed by a little-endian 4-byte float score. A
      chromosome's hits run up to the next chromosome's start offset,
      or to the end of the file for the last chromosome.

    A negative score is stored with strand 1 and its sign dropped; any
    other score gets strand 0. Locations are stored as ``loc-1``.
    Overlapping ranges are merged with ``merge_overlap()`` before the
    track is returned.

    When the module-level LOG flag is true, a progress bar is written
    to stdout.

    Raises Exception if species is neither "hg18" nor "mm8".
    """
    motif_range_list = FWTrackI(fw=0)
    if species == "hg18":
        autosome_count = 22
    elif species == "mm8":
        autosome_count = 19
    else:
        raise Exception("Only hg18/mm8 supported!")
    # The header stores chromosomes in exactly this order.
    chromosomes = ["chr%d" % k for k in range(1, autosome_count + 1)]
    chromosomes += ["chrX", "chrY"]

    motif_fhd.seek(0)
    data = motif_fhd.read()

    # Start offset of every chromosome, one header slot per 128 bytes.
    starts = [upk("<i", data[k * 128:k * 128 + 4])[0]
              for k in range(len(chromosomes))]

    # Each hit occupies 8 bytes. Floor division keeps the counts
    # integral on Python 3 as well, so they can be passed to range().
    ends = starts[1:] + [len(data)]
    counts = [(end - start) // 8 for start, end in zip(starts, ends)]
    total_motif_hits = sum(counts)

    read_motif_hits = 0
    for chromosome, p, hits in zip(chromosomes, starts, counts):
        for _ in range(hits):
            read_motif_hits += 1
            if LOG:
                portion = float(read_motif_hits) / total_motif_hits
                sys.stdout.write("\r  %.1f%% %s" % (portion*100,"#"*int(portion*50)))
                sys.stdout.flush()
            loc = upk("<i", data[p:p + 4])[0]
            score = upk("<f", data[p + 4:p + 8])[0]
            p += 8
            # The sign of the score encodes the strand.
            if score < 0:
                strand = 1
                score = -score
            else:
                strand = 0
            if score > cutoff:
                motif_range_list.add_loc(chromosome, loc - 1, strand)
    if LOG:
        sys.stdout.write("\n")
    # Drop the raw buffer before merging to keep peak memory down.
    data = None
    motif_range_list.merge_overlap()
    return motif_range_list