Exemple #1
0
    def build_fwtrack_v2(self, teIdx):
        """Build a FWTrackII from every alignment record in ``self.fhd``.

        The binary stream is read starting at offset 4: a 4-byte header
        length, the header itself (skipped), the number of reference
        sequences, one name/size entry per reference, and then
        length-prefixed alignment records until no complete length prefix
        is left.

        Consecutive records carrying the same read name are collected into
        one group.  Each finished group is handed to ``self.sameFam``
        together with ``teIdx``; every read it returns is added to the
        track with the weight it returns.  Records whose parsed position is
        negative are ignored.

        The tag size of the track is set to the integer mean fragment
        length of the first 1000 records.

        Args:
            teIdx: passed through unchanged to ``self.sameFam``.

        Returns:
            The populated FWTrackII object.

        Note:
            ``self.fhd`` is always closed, also when parsing fails
            part-way through the file.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)
        fseek = self.fhd.seek
        fread = self.fhd.read
        ftell = self.fhd.tell
        unpack = struct.unpack

        def flush(reads):
            # sameFam selects which alignments of the group to keep and the
            # single weight that is applied to all of them.
            (sel_reads, w) = self.sameFam(reads, teIdx)
            for rr in sel_reads:
                fwtrack.add_loc(rr.chrom, rr.start, rr.strand, w)

        seq_len = 0
        seq_len_count = 0
        try:
            references = []
            # move to pos 4, get the length of header, then skip the header
            fseek(4)
            header_len = unpack('<i', fread(4))[0]
            fseek(header_len + ftell())
            # get the number of chromosomes
            nc = unpack('<i', fread(4))[0]

            for _ in range(nc):
                # read each chromosome name; the last stored byte is dropped
                # (presumably a terminator -- confirm against the format)
                nlength = unpack('<i', fread(4))[0]
                references.append(fread(nlength)[:-1])
                # jump over chromosome size, we don't need it
                fseek(ftell() + 4)

            i = 0
            m = 0
            multi_reads = []
            prev_seq = ""
            while True:
                try:
                    entrylength = unpack('<i', fread(4))[0]
                except struct.error:
                    # fewer than 4 bytes left: end of the record stream
                    break
                (seq_name, chrid, fpos, flen, strand,
                 qual) = self.__binary_parse(fread(entrylength))

                # only the first 1000 records contribute to the tag size
                if seq_len_count < 1000:
                    seq_len += flen
                    seq_len_count += 1

                # progress message every million records
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d" % (m * 1000000))
                    i = 0

                if fpos < 0:
                    continue

                r = Read()
                r.chrom = references[chrid]
                if strand == 1:
                    # for strand 1 the parsed position is the fragment end
                    r.strand = 1
                    r.start = fpos - flen
                    r.end = fpos
                else:
                    r.start = fpos
                    r.end = fpos + flen
                    r.strand = 0
                r.name = seq_name
                r.qual = qual

                if seq_name == prev_seq:
                    # Further alignment of the current read.  Only these
                    # follow-up reads get an explicit zero weight; the first
                    # read of a group keeps whatever Read() initialises
                    # (kept exactly as in the original logic).
                    r.weight = 0
                else:
                    # A new read name starts: emit the finished group first.
                    if prev_seq != "":
                        flush(multi_reads)
                    multi_reads = []
                    prev_seq = seq_name
                multi_reads.append(r)

            # emit the last group, which no later record has flushed
            if multi_reads:
                flush(multi_reads)
        finally:
            self.fhd.close()

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        self.__buildAready = True

        return fwtrack
Exemple #2
0
    def build_fwtrack_v2(self, teIdx):
        """Build a FWTrackII from every alignment record in ``self.fhd``.

        Reading starts at offset 4 of the binary stream: a 4-byte header
        length, the header (skipped), the count of reference sequences, one
        name/size entry per reference, then length-prefixed alignment
        records until a complete 4-byte length prefix can no longer be
        read.

        Records that follow each other with the same read name form one
        group.  When a group is complete it is passed to ``self.sameFam``
        along with ``teIdx``; each read returned is added to the track
        using the returned weight.  Records with a negative parsed position
        are skipped.

        The track's tag size becomes the integer mean fragment length over
        the first 1000 records.

        Args:
            teIdx: forwarded unchanged to ``self.sameFam``.

        Returns:
            The populated FWTrackII object.

        Note:
            ``self.fhd`` is closed in all cases, including when an error
            interrupts parsing.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)
        fseek = self.fhd.seek
        fread = self.fhd.read
        ftell = self.fhd.tell

        def read_int32():
            # every length/count field is a little-endian 32-bit integer
            return struct.unpack('<i', fread(4))[0]

        def emit_group(group):
            # sameFam returns the alignments to keep plus one shared weight
            (sel_reads, w) = self.sameFam(group, teIdx)
            for rr in sel_reads:
                fwtrack.add_loc(rr.chrom, rr.start, rr.strand, w)

        seq_len = 0
        seq_len_count = 0
        try:
            # move to pos 4, get the length of header and jump past it
            fseek(4)
            header_len = read_int32()
            fseek(header_len + ftell())

            # reference table: <name length><name><size> per chromosome
            references = []
            nc = read_int32()
            for _ in range(nc):
                nlength = read_int32()
                # the final byte of the stored name is cut off (presumably
                # a terminator -- confirm against the file format)
                chrname = fread(nlength)[:-1]
                references.append(chrname)
                # jump over chromosome size, we don't need it
                fseek(ftell() + 4)

            i = 0
            m = 0
            multi_reads = []
            prev_seq = ""
            while True:
                try:
                    entrylength = read_int32()
                except struct.error:
                    # no complete length prefix left: end of the records
                    break
                (seq_name, chrid, fpos, flen, strand,
                 qual) = self.__binary_parse(fread(entrylength))

                # tag size is estimated from the first 1000 records only
                if seq_len_count < 1000:
                    seq_len += flen
                    seq_len_count += 1

                # log progress once per million records
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d" % (m * 1000000))
                    i = 0

                if fpos < 0:
                    continue

                r = Read()
                r.chrom = references[chrid]
                if strand == 1:
                    # on strand 1 the parsed position marks the fragment end
                    r.strand = 1
                    r.start = fpos - flen
                    r.end = fpos
                else:
                    r.start = fpos
                    r.end = fpos + flen
                    r.strand = 0
                r.name = seq_name
                r.qual = qual

                if seq_name == prev_seq:
                    # Another alignment of the same read.  As in the
                    # original code, only these follow-up reads receive an
                    # explicit zero weight; the group's first read keeps the
                    # value Read() gives it.
                    r.weight = 0
                else:
                    # New read name: the previous group is now complete.
                    if prev_seq != "":
                        emit_group(multi_reads)
                    multi_reads = []
                    prev_seq = seq_name
                multi_reads.append(r)

            # the last group is still pending after the loop
            if multi_reads:
                emit_group(multi_reads)
        finally:
            self.fhd.close()

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        self.__buildAready = True

        return fwtrack
Exemple #3
0
    def build_fwtrack_v2(self, teIdx):
        """Build a sorted FWTrackII from the BED file ``self.__srcfile``.

        Each data line is split on tabs: column 1 is the chromosome,
        columns 2 and 3 are integer coordinates, column 4 is the read name
        and column 6 the strand ("+" gives strand 0 and uses column 2 as
        the position; anything else gives strand 1 and uses column 3).

        * Lines with more than six columns carry their own weight in
          column 7 and are added to the track directly.
        * Other lines are grouped while consecutive lines share the same
          read name.  Each finished group is passed to ``self.sameFam``
          with ``teIdx`` and the read name; every read returned is added
          with the returned weight.

        Blank lines and header lines starting with ``#``, ``track`` or
        ``browser`` are skipped.  The tag size is set to the integer mean
        of (column 3 - column 2) over the first 1000 data lines.

        Args:
            teIdx: passed through unchanged to ``self.sameFam``.

        Returns:
            The populated and sorted FWTrackII object.

        The process exits with status 1 if the file cannot be opened.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)

        def flush(reads, read_name):
            # sameFam selects the alignments to keep and their shared weight
            (sel_reads, w) = self.sameFam(reads, teIdx, read_name)
            for read in sel_reads:
                fwtrack.add_loc(read.chrom, read.start, read.strand, w)

        try:
            f = open(self.__srcfile, 'r')
        except IOError:
            logging.error("open file %s error !\n" % (self.__srcfile))
            sys.exit(1)

        i = 0
        m = 0
        pre_seq_name = ""
        multi_reads = []
        seq_len_count = 0
        seq_len = 0
        # the context manager closes the file even if a line fails to parse
        with f:
            for line in f:
                line = line.strip()
                # nothing to parse on empty lines or BED header lines
                if not line or line.startswith(("#", "track", "browser")):
                    continue
                items = line.split('\t')
                chrname = items[0]

                # only the first 1000 data lines contribute to the tag size
                if seq_len_count < 1000:
                    seq_len += int(items[2]) - int(items[1])
                    seq_len_count += 1

                # progress message every million lines
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d" % (m * 1000000))
                    i = 0

                if items[5] == "+":
                    strand = 0
                    start = int(items[1])
                else:
                    strand = 1
                    start = int(items[2])

                if len(items) > 6:
                    # there is weight assigned to each alignment
                    fwtrack.add_loc(chrname, start, strand, float(items[6]))
                    continue

                if pre_seq_name == "":
                    pre_seq_name = items[3]

                r = Read()
                r.chrom = chrname
                r.start = start
                r.strand = strand

                if pre_seq_name != items[3]:
                    # a new read name starts: emit the finished group first
                    flush(multi_reads, pre_seq_name)
                    multi_reads = []
                    pre_seq_name = items[3]
                multi_reads.append(r)

            # Emit the last group.  The read name is passed here as well so
            # this call matches the in-loop sameFam calls.
            if multi_reads:
                flush(multi_reads, pre_seq_name)

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        fwtrack.sort()
        self.__buildAready = True

        return fwtrack
Exemple #4
0
    def build_fwtrack_v2(self, teIdx):
        """Build a sorted FWTrackII from the BED file ``self.__srcfile``.

        Every data line is split on tabs.  Column 1 is the chromosome,
        columns 2 and 3 are integer coordinates, column 4 is the read name
        and column 6 the strand: "+" maps to strand 0 with column 2 as the
        position, any other value maps to strand 1 with column 3 as the
        position.

        * A line with more than six columns has its weight in column 7 and
          is added to the track immediately.
        * All other lines are grouped as long as consecutive lines share
          one read name.  A completed group goes to ``self.sameFam`` with
          ``teIdx`` and the read name; each read it returns is added with
          the weight it returns.

        Blank lines and header lines beginning with ``#``, ``track`` or
        ``browser`` are ignored.  The tag size is the integer mean of
        (column 3 - column 2) over the first 1000 data lines.

        Args:
            teIdx: forwarded unchanged to ``self.sameFam``.

        Returns:
            The populated and sorted FWTrackII object.

        The process exits with status 1 if the file cannot be opened.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)

        def emit_group(group, read_name):
            # sameFam returns the alignments to keep plus one shared weight
            (sel_reads, w) = self.sameFam(group, teIdx, read_name)
            for read in sel_reads:
                fwtrack.add_loc(read.chrom, read.start, read.strand, w)

        try:
            f = open(self.__srcfile, 'r')
        except IOError:
            logging.error("open file %s error !\n" % (self.__srcfile))
            sys.exit(1)

        i = 0
        m = 0
        pre_seq_name = ""
        multi_reads = []
        seq_len_count = 0
        seq_len = 0
        # "with" guarantees the file is closed even when parsing raises
        with f:
            # Go through bed file and add each alignment to the track.
            for line in f:
                line = line.strip()
                # skip empty lines and BED header/comment lines
                if not line or line.startswith(("#", "track", "browser")):
                    continue
                items = line.split('\t')
                chrname = items[0]

                # tag size is estimated from the first 1000 data lines only
                if seq_len_count < 1000:
                    seq_len += int(items[2]) - int(items[1])
                    seq_len_count += 1

                # log progress once per million lines
                i += 1
                if i == 1000000:
                    m += 1
                    logging.info(" %d" % (m * 1000000))
                    i = 0

                if items[5] == "+":
                    strand = 0
                    start = int(items[1])
                else:
                    strand = 1
                    start = int(items[2])

                if len(items) > 6:
                    # there is weight assigned to each alignment
                    w = float(items[6])
                    fwtrack.add_loc(chrname, start, strand, w)
                    continue

                if pre_seq_name == "":
                    pre_seq_name = items[3]

                r = Read()
                r.chrom = chrname
                r.start = start
                r.strand = strand

                if pre_seq_name != items[3]:
                    # read name changed: the previous group is complete
                    emit_group(multi_reads, pre_seq_name)
                    multi_reads = []
                    pre_seq_name = items[3]
                multi_reads.append(r)

            # The last group is still pending after the loop.  The read
            # name is passed so this call matches the in-loop sameFam calls.
            if multi_reads:
                emit_group(multi_reads, pre_seq_name)

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        fwtrack.sort()
        self.__buildAready = True

        return fwtrack