コード例 #1
0
    def build_fwtrack_v2(self, teIdx):
        """Build a FWTrackII from every record of the binary alignment stream.

        The stream header and the reference-sequence table are consumed first
        (the layout read here looks like BAM -- TODO confirm); the reference
        names are kept so that each record's ``chrid`` can be turned into a
        chromosome name.  Records are then read one at a time:

        * records whose parsed position is negative are skipped;
        * consecutive records carrying the same read name are collected into
          one group;
        * each finished group is handed to ``self.sameFam(group, teIdx)`` and
          every read it returns is added to the track with the weight that
          was returned alongside it.

        The mean ``flen`` of the first 1000 records read is stored as the tag
        size of the track.  ``self.fhd`` is always closed before this method
        returns or raises.

        Args:
            teIdx: passed through unchanged to ``self.sameFam``.

        Returns:
            The populated FWTrackII object.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)
        # Local aliases: these are called at least once per record.
        fseek = self.fhd.seek
        fread = self.fhd.read
        ftell = self.fhd.tell
        unpack = struct.unpack

        def _add_group(group):
            # Resolve one group of same-named alignments and add the kept ones.
            (sel_reads, w) = self.sameFam(group, teIdx)
            for rr in sel_reads:
                fwtrack.add_loc(rr.chrom, rr.start, rr.strand, w)

        seq_len = 0
        seq_len_count = 0
        try:
            # The header length is stored at offset 4; skip the header text.
            fseek(4)
            header_len = unpack('<i', fread(4))[0]
            fseek(header_len + ftell())

            # Reference table: a count, then (name length, name, size) entries.
            nc = unpack('<i', fread(4))[0]
            references = []
            for _ in range(nc):
                nlength = unpack('<i', fread(4))[0]
                # Drop the final byte of the name field (presumably its
                # terminator).
                references.append(fread(nlength)[:-1])
                # Jump over the chromosome size; it is not needed here.
                fseek(ftell() + 4)

            n_records = 0
            multi_reads = []
            prev_seq = ""
            while True:
                try:
                    entrylength = unpack('<i', fread(4))[0]
                except struct.error:
                    # Fewer than 4 bytes left: end of the stream.
                    break
                (seq_name, chrid, fpos, flen, strand,
                 qual) = self.__binary_parse(fread(entrylength))

                # Sample the first 1000 records to estimate the tag size.
                if seq_len_count < 1000:
                    seq_len += flen
                    seq_len_count += 1

                # Progress report every million records.
                n_records += 1
                if n_records % 1000000 == 0:
                    logging.info(" %d" % n_records)

                if fpos < 0:
                    continue

                same_name = seq_name == prev_seq
                if not same_name:
                    # A new read name starts: emit the finished group first.
                    if prev_seq != "":
                        _add_group(multi_reads)
                    multi_reads = []
                    prev_seq = seq_name

                r = Read()
                r.chrom = references[chrid]
                if strand == 1:
                    r.strand = 1
                    r.start = fpos - flen
                    r.end = fpos
                else:
                    r.start = fpos
                    r.end = fpos + flen
                    r.strand = 0
                r.name = seq_name
                if same_name:
                    # NOTE(review): only the second and later alignments of a
                    # group get their weight reset to 0; the first keeps
                    # whatever Read() initialises.  Confirm this asymmetry is
                    # intended.
                    r.weight = 0
                r.qual = qual
                multi_reads.append(r)

            # Emit the last group, which no following record has closed.
            if multi_reads:
                _add_group(multi_reads)
        finally:
            # Release the handle even when parsing fails part-way through.
            self.fhd.close()

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        self.__buildAready = True

        return fwtrack
コード例 #2
0
    def build_fwtrack_v2(self, teIdx):
        """Build a FWTrackII from the tab-separated file ``self.__srcfile``.

        Each non-blank line is split on tabs and read as: column 0 chromosome,
        columns 1 and 2 integer start and end, column 3 read name, column 5
        strand, and an optional column 6 weight.  For a ``"+"`` strand the
        location is column 1 with strand 0; for anything else it is column 2
        with strand 1.

        * A line that has a weight column is added to the track directly with
          that weight.
        * Otherwise consecutive lines sharing the same read name are grouped,
          each finished group is resolved through ``self.sameFam`` and every
          read it returns is added with the weight returned alongside it.

        The mean ``end - start`` of the first 1000 lines is stored as the tag
        size, and the track is sorted before it is returned.  Blank lines are
        skipped.  The input file is closed even when a line fails to parse.

        Args:
            teIdx: passed through unchanged to ``self.sameFam``.

        Returns:
            The populated, sorted FWTrackII object.

        Raises:
            SystemExit: via ``sys.exit(1)`` when the file cannot be opened.
            IndexError, ValueError: on a line with fewer than six columns or
                non-integer coordinates.
        """
        fwtrack = FWTrackII(filename=self.__srcfile)
        n_lines = 0
        pre_seq_name = ""
        multi_reads = []
        seq_len_count = 0
        seq_len = 0
        try:
            f = open(self.__srcfile, 'r')
        except IOError:
            logging.error("open file %s error !\n" % (self.__srcfile))
            sys.exit(1)
        else:
            # The context manager closes the file on success and on error.
            with f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # e.g. a trailing empty line; nothing to parse.
                        continue
                    items = line.split('\t')
                    chrname = items[0]

                    # Sample the first 1000 lines to estimate the tag size.
                    if seq_len_count < 1000:
                        seq_len += int(items[2]) - int(items[1])
                        seq_len_count += 1

                    # Progress report every million lines.
                    n_lines += 1
                    if n_lines % 1000000 == 0:
                        logging.info(" %d" % n_lines)

                    if items[5] == "+":
                        strand = 0
                        start = int(items[1])
                    else:
                        strand = 1
                        start = int(items[2])

                    if len(items) > 6:
                        # A weight is already assigned to this alignment.
                        fwtrack.add_loc(chrname, start, strand,
                                        float(items[6]))
                        continue

                    if pre_seq_name == "":
                        pre_seq_name = items[3]

                    r = Read()
                    r.chrom = chrname
                    r.start = start
                    r.strand = strand

                    if pre_seq_name != items[3]:
                        # The read name changed: emit the finished group.
                        (sel_reads, w) = self.sameFam(multi_reads, teIdx,
                                                      pre_seq_name)
                        for read in sel_reads:
                            fwtrack.add_loc(read.chrom, read.start,
                                            read.strand, w)
                        multi_reads = []
                        pre_seq_name = items[3]
                    multi_reads.append(r)

                # Emit the last group, which no following line has closed.
                if multi_reads:
                    # NOTE(review): this call omits the read-name argument
                    # that the in-loop call passes; confirm against the
                    # sameFam signature whether that is intended.
                    (sel_reads, w) = self.sameFam(multi_reads, teIdx)
                    for read in sel_reads:
                        fwtrack.add_loc(read.chrom, read.start, read.strand,
                                        w)

        if seq_len_count > 0:
            fwtrack.setTsize(int(seq_len / seq_len_count))

        fwtrack.sort()
        self.__buildAready = True

        return fwtrack