Esempio n. 1
0
    def build_fragment(self):
        """Create, save and populate a new Fragment from the stored sequence.

        The sequence is pre-chunked at every feature start and at the position
        just past every feature end, so there should be no need to further
        divide any chunk during import.

        Returns:
            The object returned by ``Fragment.indexed_fragment()`` after all
            chunks have been appended to it.
        """
        # Features are read as (start, end, ...): a chunk boundary sits at each
        # start and one past each end.
        # NOTE(review): the ``- 1`` below suggests 1-based coordinates --
        # confirm against the code that fills ``self.__features``.
        boundaries = set()
        for feature in self.__features:
            boundaries.add(feature[0])
            boundaries.add(feature[1] + 1)
        break_points = sorted(boundaries)

        chunk_sizes = []
        # Anything before the first boundary becomes a leading chunk.
        if break_points and break_points[0] > 1:
            chunk_sizes.append(break_points[0] - 1)
        # Each pair of consecutive boundaries delimits one chunk.
        chunk_sizes.extend(
            nxt - cur for cur, nxt in zip(break_points, break_points[1:]))
        # Single-argument parenthesised form: same output under Python 2,
        # and valid syntax under Python 3.
        print('%d chunks' % (len(chunk_sizes),))

        new_fragment = Fragment(name=self.__rec.id, circular=False, parent=None, start_chunk=None)
        new_fragment.save()
        new_fragment = new_fragment.indexed_fragment()

        prev = None
        flen = 0
        for sz in chunk_sizes:
            prev = new_fragment._append_to_fragment(prev, flen, self.__sequence[flen:flen + sz])
            flen += sz

        # Sequence remaining after the last boundary goes into a final chunk.
        seqlen = len(self.__sequence)
        if flen < seqlen:
            new_fragment._append_to_fragment(prev, flen, self.__sequence[flen:seqlen])

        return new_fragment
Esempio n. 2
0
    def build_fragment(self):
        """Build a new Fragment whose chunks line up with feature boundaries.

        The fragment sequence is pre-chunked at feature start and end
        locations, so there should be no need to further divide any chunk
        during import.

        Returns:
            The indexed fragment (result of ``indexed_fragment()`` on the saved
            ``Fragment``) with every chunk of the sequence appended.
        """
        # A boundary is placed at each feature start and one past each end.
        starts = [feature[0] for feature in self.__features]
        ends_plus_one = [feature[1] + 1 for feature in self.__features]
        break_points = sorted(set(starts + ends_plus_one))

        chunk_sizes = []
        previous = None
        for bp in break_points:
            if previous is None:
                # Leading chunk, only when the first boundary is past
                # position 1.
                if bp > 1:
                    chunk_sizes.append(bp - 1)
            else:
                chunk_sizes.append(bp - previous)
            previous = bp
        # Parenthesised single-argument print behaves the same on Python 2
        # and also parses on Python 3.
        print('%d chunks' % (len(chunk_sizes), ))

        new_fragment = Fragment(name=self.__rec.id,
                                circular=False,
                                parent=None,
                                start_chunk=None)
        new_fragment.save()
        new_fragment = new_fragment.indexed_fragment()

        prev = None
        flen = 0
        for sz in chunk_sizes:
            prev = new_fragment._append_to_fragment(
                prev, flen, self.__sequence[flen:flen + sz])
            flen += sz

        # Append whatever sequence is left after the last boundary.
        seqlen = len(self.__sequence)
        if flen < seqlen:
            new_fragment._append_to_fragment(prev, flen,
                                             self.__sequence[flen:seqlen])

        return new_fragment
Esempio n. 3
0
    def build_fragment(self, reference_based=True, dirn='.'):
        """Create, save and populate a new Fragment from the parsed record.

        Chunk boundaries are placed at the start and one past the end of every
        feature and subfeature, so there should be no need to further divide
        any chunk during import.

        Args:
            reference_based: when true, the full sequence is passed to
                ``Chunk.CHUNK_REFERENCE_CLASS.generate_from_fragment`` and
                ``_bulk_create_fragment_chunks`` is called with the chunk
                sizes. Otherwise each chunk is appended one at a time together
                with its slice of the sequence.
            dirn: forwarded as ``dirn`` to ``generate_from_fragment``; not
                used when ``reference_based`` is false.

        Returns:
            The indexed fragment (result of ``indexed_fragment()`` on the
            saved ``Fragment``).
        """
        boundaries = set()
        for feature in self.__features:
            # feature[2] is the key under which this feature's subfeatures
            # are stored.
            name = feature[2]
            boundaries.add(feature[0])
            boundaries.add(feature[1] + 1)
            for subfeature in self.__subfeatures_dict[name]:
                boundaries.add(subfeature[0])
                boundaries.add(subfeature[1] + 1)
        break_points = sorted(boundaries)

        chunk_sizes = []
        # Anything before the first boundary becomes a leading chunk.
        if break_points and break_points[0] > 1:
            chunk_sizes.append(break_points[0] - 1)
        # Each pair of consecutive boundaries delimits one chunk.
        chunk_sizes.extend(
            nxt - cur for cur, nxt in zip(break_points, break_points[1:])
        )

        # Sequence left after the last boundary goes into a trailing chunk.
        covered = sum(chunk_sizes)
        if covered < self.__seqlen:
            chunk_sizes.append(self.__seqlen - covered)

        # Circularity is read from the first region/chromosome-type feature
        # only; the search stops there whether or not it carries the
        # Is_circular qualifier.
        fragment_circular = False
        whole_sequence_types = ('REGION', 'CHR', 'CHROM', 'CHROMOSOME')
        for feature in self.__rec.features:
            if feature.type.upper() in whole_sequence_types:
                if 'Is_circular' in feature.qualifiers:
                    fragment_circular = feature.qualifiers['Is_circular'][0].upper() == 'TRUE'
                break

        new_fragment = Fragment(
            name=self.__rec.id, circular=fragment_circular, parent=None, start_chunk=None
        )
        new_fragment.save()
        print("Fragment %s" % (new_fragment.id))
        new_fragment = new_fragment.indexed_fragment()

        if reference_based:
            print("%d chunks" % (len(chunk_sizes),))
            t0 = time.time()
            Chunk.CHUNK_REFERENCE_CLASS.generate_from_fragment(
                new_fragment, str(self.__rec.seq), dirn=dirn
            )
            print("Reference file generation took %s seconds" % (time.time() - t0))

            new_fragment._bulk_create_fragment_chunks(chunk_sizes)
            return new_fragment

        # divide chunks bigger than a certain threshold to smaller chunks, to
        # allow insertion of sequence into database. e.g. MySQL has a packet
        # size that prevents chunks that are too large from being inserted.
        chunk_size_limit = 1000000
        bounded_sizes = []
        for size in chunk_sizes:
            if size < chunk_size_limit:
                bounded_sizes.append(size)
                continue
            # As many full-limit pieces as fit, then the remainder (if any).
            full_pieces, remainder = divmod(size, chunk_size_limit)
            bounded_sizes.extend([chunk_size_limit] * full_pieces)
            if remainder:
                bounded_sizes.append(remainder)
        chunk_sizes = bounded_sizes
        print("%d chunks" % (len(chunk_sizes),))

        prev = None
        fragment_len = 0
        for chunk_size in chunk_sizes:
            t0 = time.time()
            prev = new_fragment._append_to_fragment(
                prev,
                fragment_len,
                str(self.__rec.seq[fragment_len : fragment_len + chunk_size]),
            )
            fragment_len += chunk_size
            # "\r" with end="" keeps rewriting the same terminal line.
            print("add chunk to fragment: %.4f\r" % (time.time() - t0,), end="")

        return new_fragment