def build_fwtrack(self, temode):
    """Build a FWTrackII from every alignment record of the stream ``self.fhd``.

    The stream is read as: a header whose length is stored at byte offset 4,
    a table of reference (chromosome) names, then length-prefixed alignment
    records until no further length prefix can be read.  Consecutive records
    that share the same read name are grouped as alternative alignments of
    one read; records are assumed to arrive grouped by name, since only
    adjacent records are compared.

    Args:
        temode: ``'uniq'`` adds only reads whose group holds exactly one
            alignment; ``'multi'`` adds every alignment of every group.
            In both cases each added location carries the weight
            ``round(1.0 / n, 2)`` with ``n`` the group size.  Any other
            value adds nothing to the track.

    Returns:
        The populated FWTrackII.  Its tag size is set to the integer mean
        of ``flen`` over the first (at most) 1000 records.  The track is
        not sorted here.

    ``self.fhd`` is closed before returning, and also when parsing raises.
    """
    fwtrack = FWTrackII(filename=self.__srcfile)
    fseek = self.fhd.seek
    fread = self.fhd.read
    ftell = self.fhd.tell

    def read_int32():
        # Little-endian signed 32-bit integer at the current position;
        # raises struct.error when fewer than 4 bytes remain.
        return struct.unpack('<i', fread(4))[0]

    def add_group(group):
        # Push one read-name group into the track as dictated by temode.
        if (temode == 'uniq' and len(group) == 1) or temode == 'multi':
            w = round(1.0 / len(group), 2)
            for rr in group:
                fwtrack.add_loc(rr.chrom, rr.start, rr.strand, w)

    references = []
    i = 0  # records seen since the last progress message
    m = 0  # number of progress messages emitted (millions of records)
    multi_reads = []
    prev_seq = ""
    seq_len = 0
    seq_len_count = 0

    try:
        # move to pos 4, get the length of header
        fseek(4)
        header_len = read_int32()
        fseek(header_len + ftell())
        # get the number of chromosome
        nc = read_int32()
        for _ in range(nc):
            # read each chromosome name; the final byte is dropped
            # (presumably the NUL terminator -- confirm against the format)
            nlength = read_int32()
            references.append(fread(nlength)[:-1])
            # jump over chromosome size, we don't need it
            fseek(ftell() + 4)

        while True:
            try:
                entrylength = read_int32()
            except struct.error:
                # No complete length prefix left: end of the records.
                break
            (seq_name, chrid, fpos, flen, strand, qual) = self.__binary_parse(fread(entrylength))

            # Only the first 1000 records contribute to the tag-size estimate.
            if seq_len_count < 1000:
                seq_len += flen
                seq_len_count += 1

            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0

            if fpos < 0:
                # Negative position: record is skipped
                # (presumably unmapped/filtered by __binary_parse).
                continue

            same_read = seq_name == prev_seq
            if not same_read:
                # A new read name starts: emit the finished group first.
                if prev_seq != "":
                    add_group(multi_reads)
                multi_reads = []
                prev_seq = seq_name

            r = Read()
            r.chrom = references[chrid]
            if strand == 1:
                r.strand = 1
                r.start = fpos - flen
                r.end = fpos
            else:
                r.start = fpos
                r.end = fpos + flen
                r.strand = 0
            r.name = seq_name
            if same_read:
                # Only second and later alignments of a read get an
                # explicit weight; the first keeps whatever Read() provides.
                r.weight = 0
            r.qual = qual
            multi_reads.append(r)

        # Emit the group that was still open when the stream ended.
        if multi_reads:
            add_group(multi_reads)
    finally:
        # Release the handle even if a record fails to parse.
        self.fhd.close()

    if seq_len_count > 0:
        fwtrack.setTsize(int(seq_len / seq_len_count))
    self.__buildAready = True
    return fwtrack
def build_fwtrack_v2(self, teIdx):
    """Build a FWTrackII from the stream ``self.fhd``, filtering via ``sameFam``.

    The stream is read as: a header whose length is stored at byte offset 4,
    a table of reference (chromosome) names, then length-prefixed alignment
    records until no further length prefix can be read.  Consecutive records
    that share the same read name are grouped as alternative alignments of
    one read; only adjacent records are compared, so records are assumed to
    arrive grouped by name.

    Each finished group is handed to ``self.sameFam(group, teIdx)``, which
    returns ``(selected_reads, weight)``; every selected read is added to
    the track with that weight.  How reads are selected and weighted is
    decided entirely by ``sameFam``.

    Args:
        teIdx: passed through unchanged to ``self.sameFam``
            (presumably a TE annotation index -- confirm against sameFam).

    Returns:
        The populated FWTrackII.  Its tag size is set to the integer mean
        of ``flen`` over the first (at most) 1000 records.  The track is
        not sorted here.

    ``self.fhd`` is closed before returning, and also when parsing raises.
    """
    fwtrack = FWTrackII(filename=self.__srcfile)
    fseek = self.fhd.seek
    fread = self.fhd.read
    ftell = self.fhd.tell

    def read_int32():
        # Little-endian signed 32-bit integer at the current position;
        # raises struct.error when fewer than 4 bytes remain.
        return struct.unpack('<i', fread(4))[0]

    def add_group(group):
        # Let sameFam pick the reads to keep and their common weight.
        (sel_reads, w) = self.sameFam(group, teIdx)
        for rr in sel_reads:
            fwtrack.add_loc(rr.chrom, rr.start, rr.strand, w)

    references = []
    i = 0  # records seen since the last progress message
    m = 0  # number of progress messages emitted (millions of records)
    multi_reads = []
    prev_seq = ""
    seq_len = 0
    seq_len_count = 0

    try:
        # move to pos 4, get the length of header
        fseek(4)
        header_len = read_int32()
        fseek(header_len + ftell())
        # get the number of chromosome
        nc = read_int32()
        for _ in range(nc):
            # read each chromosome name; the final byte is dropped
            # (presumably the NUL terminator -- confirm against the format)
            nlength = read_int32()
            references.append(fread(nlength)[:-1])
            # jump over chromosome size, we don't need it
            fseek(ftell() + 4)

        while True:
            try:
                entrylength = read_int32()
            except struct.error:
                # No complete length prefix left: end of the records.
                break
            (seq_name, chrid, fpos, flen, strand, qual) = self.__binary_parse(fread(entrylength))

            # Only the first 1000 records contribute to the tag-size estimate.
            if seq_len_count < 1000:
                seq_len += flen
                seq_len_count += 1

            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0

            if fpos < 0:
                # Negative position: record is skipped
                # (presumably unmapped/filtered by __binary_parse).
                continue

            same_read = seq_name == prev_seq
            if not same_read:
                # A new read name starts: emit the finished group first.
                if prev_seq != "":
                    add_group(multi_reads)
                multi_reads = []
                prev_seq = seq_name

            r = Read()
            r.chrom = references[chrid]
            if strand == 1:
                r.strand = 1
                r.start = fpos - flen
                r.end = fpos
            else:
                r.start = fpos
                r.end = fpos + flen
                r.strand = 0
            r.name = seq_name
            if same_read:
                # Only second and later alignments of a read get an
                # explicit weight; the first keeps whatever Read() provides.
                # Kept as-is because sameFam receives these objects.
                r.weight = 0
            r.qual = qual
            multi_reads.append(r)

        # Emit the group that was still open when the stream ended.
        if multi_reads:
            add_group(multi_reads)
    finally:
        # Release the handle even if a record fails to parse.
        self.fhd.close()

    if seq_len_count > 0:
        fwtrack.setTsize(int(seq_len / seq_len_count))
    self.__buildAready = True
    return fwtrack
def build_fwtrack_v2(self, teIdx):
    """Build a sorted FWTrackII from the tab-separated file ``self.__srcfile``.

    Each non-blank line is split on tabs and read as: column 0 chromosome,
    columns 1 and 2 integer start/end, column 3 read name, column 5 strand,
    and an optional column 6 weight.

    * ``"+"`` in column 5 gives strand 0 positioned at column 1; anything
      else gives strand 1 positioned at column 2.
    * A line with a column 6 is added to the track directly with that
      weight and takes no part in grouping.
    * Otherwise consecutive lines with the same read name are grouped and
      each finished group is handed to ``self.sameFam``, which returns
      ``(selected_reads, weight)``; every selected read is added with that
      weight.

    Args:
        teIdx: passed through unchanged to ``self.sameFam``
            (presumably a TE annotation index -- confirm against sameFam).

    Returns:
        The populated, sorted FWTrackII.  Its tag size is set to the integer
        mean of ``end - start`` over the first (at most) 1000 lines.

    Exits the process with status 1 (after logging) if the file cannot be
    opened.  The file is closed even when a line fails to parse.
    """
    fwtrack = FWTrackII(filename=self.__srcfile)
    i = 0  # lines seen since the last progress message
    m = 0  # number of progress messages emitted (millions of lines)
    pre_seq_name = ""
    multi_reads = []
    seq_len_count = 0
    seq_len = 0

    try:
        f = open(self.__srcfile, 'r')
    except IOError:
        logging.error("open file %s error !\n" % (self.__srcfile))
        sys.exit(1)

    with f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            items = line.split('\t')
            chrname = items[0]

            # Only the first 1000 lines contribute to the tag-size estimate.
            if seq_len_count < 1000:
                seq_len += int(items[2]) - int(items[1])
                seq_len_count += 1

            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0

            if items[5] == "+":
                strand = 0
                start = int(items[1])
            else:
                strand = 1
                start = int(items[2])

            if len(items) > 6:
                # there is weight assigned to each alignment
                fwtrack.add_loc(chrname, start, strand, float(items[6]))
                continue

            seq_name = items[3]
            if pre_seq_name == "":
                pre_seq_name = seq_name

            r = Read()
            r.chrom = chrname
            r.start = start
            r.strand = strand

            if pre_seq_name != seq_name:
                # A new read name starts: emit the finished group first.
                # NOTE(review): this call passes the read name as a third
                # argument while the final flush below passes only two --
                # confirm sameFam accepts both forms.
                (sel_reads, w) = self.sameFam(multi_reads, teIdx, pre_seq_name)
                for read in sel_reads:
                    fwtrack.add_loc(read.chrom, read.start, read.strand, w)
                multi_reads = []
                pre_seq_name = seq_name
            multi_reads.append(r)

    # Emit the group that was still open when the file ended.
    if multi_reads:
        (sel_reads, w) = self.sameFam(multi_reads, teIdx)
        for read in sel_reads:
            fwtrack.add_loc(read.chrom, read.start, read.strand, w)

    if seq_len_count > 0:
        fwtrack.setTsize(int(seq_len / seq_len_count))
    fwtrack.sort()
    self.__buildAready = True
    return fwtrack
def build_fwtrack_v2(self, teIdx):
    """Build a sorted FWTrackII from the tab-separated file ``self.__srcfile``.

    Each non-blank line is split on tabs and read as: column 0 chromosome,
    columns 1 and 2 integer start/end, column 3 read name, column 5 strand,
    and an optional column 6 weight.

    * ``"+"`` in column 5 gives strand 0 positioned at column 1; anything
      else gives strand 1 positioned at column 2.
    * A line with a column 6 is added to the track directly with that
      weight and takes no part in grouping.
    * Otherwise consecutive lines with the same read name are grouped and
      each finished group is handed to ``self.sameFam``, which returns
      ``(selected_reads, weight)``; every selected read is added with that
      weight.

    Args:
        teIdx: passed through unchanged to ``self.sameFam``
            (presumably a TE annotation index -- confirm against sameFam).

    Returns:
        The populated, sorted FWTrackII.  Its tag size is set to the integer
        mean of ``end - start`` over the first (at most) 1000 lines.

    Exits the process with status 1 (after logging) if the file cannot be
    opened.  The file is closed even when a line fails to parse.
    """
    fwtrack = FWTrackII(filename=self.__srcfile)
    i = 0  # lines seen since the last progress message
    m = 0  # number of progress messages emitted (millions of lines)
    pre_seq_name = ""
    multi_reads = []
    seq_len_count = 0
    seq_len = 0

    try:
        f = open(self.__srcfile, 'r')
    except IOError:
        logging.error("open file %s error !\n" % (self.__srcfile))
        sys.exit(1)

    with f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            items = line.split('\t')
            chrname = items[0]

            # Only the first 1000 lines contribute to the tag-size estimate.
            if seq_len_count < 1000:
                seq_len += int(items[2]) - int(items[1])
                seq_len_count += 1

            i += 1
            if i == 1000000:
                m += 1
                logging.info(" %d" % (m * 1000000))
                i = 0

            if items[5] == "+":
                strand = 0
                start = int(items[1])
            else:
                strand = 1
                start = int(items[2])

            if len(items) > 6:
                # there is weight assigned to each alignment
                fwtrack.add_loc(chrname, start, strand, float(items[6]))
                continue

            seq_name = items[3]
            if pre_seq_name == "":
                pre_seq_name = seq_name

            r = Read()
            r.chrom = chrname
            r.start = start
            r.strand = strand

            if pre_seq_name != seq_name:
                # A new read name starts: emit the finished group first.
                # NOTE(review): this call passes the read name as a third
                # argument while the final flush below passes only two --
                # confirm sameFam accepts both forms.
                (sel_reads, w) = self.sameFam(multi_reads, teIdx, pre_seq_name)
                for read in sel_reads:
                    fwtrack.add_loc(read.chrom, read.start, read.strand, w)
                multi_reads = []
                pre_seq_name = seq_name
            multi_reads.append(r)

    # Emit the group that was still open when the file ended.
    if multi_reads:
        (sel_reads, w) = self.sameFam(multi_reads, teIdx)
        for read in sel_reads:
            fwtrack.add_loc(read.chrom, read.start, read.strand, w)

    if seq_len_count > 0:
        fwtrack.setTsize(int(seq_len / seq_len_count))
    fwtrack.sort()
    self.__buildAready = True
    return fwtrack