def build_fragment(self):
    """Create a new linear Fragment for this record and fill it with chunks.

    The sequence is pre-chunked at every feature start and at the position
    following every feature end, so there should be no need to further
    divide any chunk during import.

    Returns the indexed fragment obtained from ``indexed_fragment()``.
    """
    # Collect the unique chunk boundaries in a single pass over the features.
    break_points = set()
    for feature in self.__features:
        break_points.add(feature[0])
        break_points.add(feature[1] + 1)
    break_points = sorted(break_points)

    chunk_sizes = []
    for i, bp in enumerate(break_points):
        if i == 0:
            # A first break point of 1 produces no leading chunk; anything
            # larger means there is sequence before the first feature.
            if bp > 1:
                chunk_sizes.append(bp - 1)
        else:
            chunk_sizes.append(bp - break_points[i - 1])
    # Parenthesised single-argument form prints the same text under both the
    # Python 2 print statement and the Python 3 print function. The count
    # does not include the trailing remainder chunk appended further down.
    print('%d chunks' % (len(chunk_sizes),))

    new_fragment = Fragment(name=self.__rec.id, circular=False,
                            parent=None, start_chunk=None)
    new_fragment.save()
    new_fragment = new_fragment.indexed_fragment()

    # Append chunks in order; each call's return value is handed back as
    # ``prev`` for the next call.
    prev = None
    flen = 0
    seqlen = len(self.__sequence)
    for sz in chunk_sizes:
        prev = new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:flen + sz])
        flen += sz

    # Whatever follows the last break point becomes one final chunk.
    if flen < seqlen:
        new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:seqlen])

    return new_fragment
def build_fragment(self):
    """Create a new linear Fragment for this record and fill it with chunks.

    The sequence is pre-chunked at every feature start and at the position
    following every feature end, so there should be no need to further
    divide any chunk during import.

    Returns the indexed fragment obtained from ``indexed_fragment()``.
    """
    # Collect the unique chunk boundaries in a single pass over the features.
    break_points = set()
    for feature in self.__features:
        break_points.add(feature[0])
        break_points.add(feature[1] + 1)
    break_points = sorted(break_points)

    chunk_sizes = []
    for i, bp in enumerate(break_points):
        if i == 0:
            # A first break point of 1 produces no leading chunk; anything
            # larger means there is sequence before the first feature.
            if bp > 1:
                chunk_sizes.append(bp - 1)
        else:
            chunk_sizes.append(bp - break_points[i - 1])
    # Parenthesised single-argument form prints the same text under both the
    # Python 2 print statement and the Python 3 print function. The count
    # does not include the trailing remainder chunk appended further down.
    print('%d chunks' % (len(chunk_sizes), ))

    new_fragment = Fragment(name=self.__rec.id, circular=False,
                            parent=None, start_chunk=None)
    new_fragment.save()
    new_fragment = new_fragment.indexed_fragment()

    # Append chunks in order; each call's return value is handed back as
    # ``prev`` for the next call.
    prev = None
    flen = 0
    seqlen = len(self.__sequence)
    for sz in chunk_sizes:
        prev = new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:flen + sz])
        flen += sz

    # Whatever follows the last break point becomes one final chunk.
    if flen < seqlen:
        new_fragment._append_to_fragment(
            prev, flen, self.__sequence[flen:seqlen])

    return new_fragment
def test_finds_genomes_with_specified_fragment_ids(self):
    """Exercise the ``f`` query filter of the genome list endpoint.

    Two genomes are set up: g1 holds fragments f1 and f2, g2 holds f1
    (inherited) and f3. The assertions expect a genome to be returned only
    when the requested fragment ids are exactly the ones it holds, and an
    empty list for a partial match or unparsable input.
    """
    g1 = Genome(name="Foo")
    g1.save()
    g2 = Genome(name="Bar")
    g2.save()
    f1 = Fragment(circular=True, name="FooF1")
    f1.save()
    f2 = Fragment(circular=True, name="FooF2")
    f2.save()
    f3 = Fragment(circular=True, name="FooF3", parent=f2)
    f3.save()
    Genome_Fragment(genome=g1, fragment=f1, inherited=False).save()
    Genome_Fragment(genome=g1, fragment=f2, inherited=False).save()
    Genome_Fragment(genome=g2, fragment=f1, inherited=True).save()
    Genome_Fragment(genome=g2, fragment=f3, inherited=False).save()

    url = reverse("genome_list")

    def fetch(query=""):
        # Every request in this test must succeed and return a JSON body.
        res = self.client.get(url + query)
        self.assertEqual(res.status_code, 200)
        return json.loads(res.content)

    # no filter, return both genomes
    self.assertCountEqual([g["id"] for g in fetch()], [g1.id, g2.id])

    # looking for f1 and f2
    d = fetch("?f=%d&f=%d" % (f1.id, f2.id))
    self.assertCountEqual([g["id"] for g in d], [g1.id])

    # looking for f1 and f3
    d = fetch("?f=%d&f=%d" % (f1.id, f3.id))
    self.assertCountEqual([g["id"] for g in d], [g2.id])

    # looking for f2 and f3: no genome holds that pair
    self.assertEqual(fetch("?f=%d&f=%d" % (f2.id, f3.id)), [])

    # looking for f1 alone: both genomes also hold another fragment
    self.assertEqual(fetch("?f=%d" % (f1.id,)), [])

    # bad input, return []
    self.assertEqual(fetch("?f=[1,2,3]"), [])
def test_finds_genomes_with_specified_fragment_ids(self):
    """Exercise the ``f`` query filter of ``/edge/genomes/``.

    Two genomes are set up: g1 holds fragments f1 and f2, g2 holds f1
    (inherited) and f3. The assertions expect a genome to be returned only
    when the requested fragment ids are exactly the ones it holds, and an
    empty list for a partial match or unparsable input.
    """
    from edge.models import Genome, Fragment, Genome_Fragment

    g1 = Genome(name='Foo')
    g1.save()
    g2 = Genome(name='Bar')
    g2.save()
    f1 = Fragment(circular=True, name='FooF1')
    f1.save()
    f2 = Fragment(circular=True, name='FooF2')
    f2.save()
    f3 = Fragment(circular=True, name='FooF3', parent=f2)
    f3.save()
    Genome_Fragment(genome=g1, fragment=f1, inherited=False).save()
    Genome_Fragment(genome=g1, fragment=f2, inherited=False).save()
    Genome_Fragment(genome=g2, fragment=f1, inherited=True).save()
    Genome_Fragment(genome=g2, fragment=f3, inherited=False).save()

    def fetch(query=''):
        # Every request in this test must succeed and return a JSON body.
        res = self.client.get('/edge/genomes/' + query)
        self.assertEqual(res.status_code, 200)
        return json.loads(res.content)

    # NOTE: assertItemsEqual is the Python 2 unittest name for this check
    # (Python 3 calls it assertCountEqual); it is left as found.

    # no filter, return both genomes
    self.assertItemsEqual([g['id'] for g in fetch()], [g1.id, g2.id])

    # looking for f1 and f2
    d = fetch('?f=%d&f=%d' % (f1.id, f2.id))
    self.assertItemsEqual([g['id'] for g in d], [g1.id])

    # looking for f1 and f3
    d = fetch('?f=%d&f=%d' % (f1.id, f3.id))
    self.assertItemsEqual([g['id'] for g in d], [g2.id])

    # looking for f2 and f3: no genome holds that pair
    self.assertEqual(fetch('?f=%d&f=%d' % (f2.id, f3.id)), [])

    # looking for f1 alone: both genomes also hold another fragment
    self.assertEqual(fetch('?f=%d' % (f1.id,)), [])

    # bad input, return []
    self.assertEqual(fetch('?f=[1,2,3]'), [])
def test_finds_genomes_with_specified_fragment_ids(self):
    """Exercise the ``f`` query filter of ``/edge/genomes/``.

    Two genomes are set up: g1 holds fragments f1 and f2, g2 holds f1
    (inherited) and f3. The assertions expect a genome to be returned only
    when the requested fragment ids are exactly the ones it holds, and an
    empty list for a partial match or unparsable input.
    """
    g1 = Genome(name='Foo')
    g1.save()
    g2 = Genome(name='Bar')
    g2.save()
    f1 = Fragment(circular=True, name='FooF1')
    f1.save()
    f2 = Fragment(circular=True, name='FooF2')
    f2.save()
    f3 = Fragment(circular=True, name='FooF3', parent=f2)
    f3.save()
    Genome_Fragment(genome=g1, fragment=f1, inherited=False).save()
    Genome_Fragment(genome=g1, fragment=f2, inherited=False).save()
    Genome_Fragment(genome=g2, fragment=f1, inherited=True).save()
    Genome_Fragment(genome=g2, fragment=f3, inherited=False).save()

    def fetch(query=''):
        # Every request in this test must succeed and return a JSON body.
        res = self.client.get('/edge/genomes/' + query)
        self.assertEqual(res.status_code, 200)
        return json.loads(res.content)

    # NOTE: assertItemsEqual is the Python 2 unittest name for this check
    # (Python 3 calls it assertCountEqual); it is left as found.

    # no filter, return both genomes
    self.assertItemsEqual([g['id'] for g in fetch()], [g1.id, g2.id])

    # looking for f1 and f2
    d = fetch('?f=%d&f=%d' % (f1.id, f2.id))
    self.assertItemsEqual([g['id'] for g in d], [g1.id])

    # looking for f1 and f3
    d = fetch('?f=%d&f=%d' % (f1.id, f3.id))
    self.assertItemsEqual([g['id'] for g in d], [g2.id])

    # looking for f2 and f3: no genome holds that pair
    self.assertEqual(fetch('?f=%d&f=%d' % (f2.id, f3.id)), [])

    # looking for f1 alone: both genomes also hold another fragment
    self.assertEqual(fetch('?f=%d' % (f1.id, )), [])

    # bad input, return []
    self.assertEqual(fetch('?f=[1,2,3]'), [])
def build_fragment(self, reference_based=True, dirn='.'):
    """Create the Fragment for this record and populate its chunks.

    Chunk boundaries are placed at every feature and subfeature start and
    at the position following every feature and subfeature end, so there
    should be no need to further divide any chunk during import.

    When ``reference_based`` is true, a reference file is generated from the
    record sequence (``dirn`` is forwarded to the generator) and the chunks
    are bulk-created from their sizes. Otherwise oversized chunks are
    divided and each chunk's sequence is appended to the fragment in turn.

    Returns the indexed fragment obtained from ``indexed_fragment()``.
    """
    # Gather every boundary position; the set removes duplicates.
    boundaries = set()
    for feature in self.__features:
        boundaries.update((feature[0], feature[1] + 1))
        for subfeature in self.__subfeatures_dict[feature[2]]:
            boundaries.update((subfeature[0], subfeature[1] + 1))
    break_points = sorted(boundaries)

    # Turn consecutive boundaries into chunk lengths. A first boundary of 1
    # yields no leading chunk.
    chunk_sizes = []
    previous = None
    for point in break_points:
        if previous is None:
            if point > 1:
                chunk_sizes.append(point - 1)
        else:
            chunk_sizes.append(point - previous)
        previous = point

    # Anything after the last boundary becomes one trailing chunk.
    covered = sum(chunk_sizes)
    if covered < self.__seqlen:
        chunk_sizes.append(self.__seqlen - covered)

    # Look for a feature typed as spanning the whole sequence and read its
    # Is_circular qualifier, if it has one.
    # NOTE(review): the source was whitespace-collapsed; the ``break`` is
    # reconstructed as stopping at the first such feature whether or not it
    # carries Is_circular -- confirm against the original layout.
    whole_sequence_types = ('REGION', 'CHR', 'CHROM', 'CHROMOSOME')
    fragment_circular = False
    for feature in self.__rec.features:
        if feature.type.upper() not in whole_sequence_types:
            continue
        if 'Is_circular' in feature.qualifiers:
            flag = feature.qualifiers['Is_circular'][0]
            fragment_circular = flag.upper() == 'TRUE'
        break

    fragment_record = Fragment(
        name=self.__rec.id,
        circular=fragment_circular,
        parent=None,
        start_chunk=None,
    )
    fragment_record.save()
    print("Fragment %s" % fragment_record.id)
    indexed = fragment_record.indexed_fragment()

    if reference_based:
        print("%d chunks" % (len(chunk_sizes),))
        started = time.time()
        Chunk.CHUNK_REFERENCE_CLASS.generate_from_fragment(
            indexed, str(self.__rec.seq), dirn=dirn
        )
        elapsed = time.time() - started
        print("Reference file generation took %s seconds" % elapsed)
        indexed._bulk_create_fragment_chunks(chunk_sizes)
        return indexed

    # Divide chunks at or above the threshold into smaller chunks so the
    # sequence can be inserted into the database; e.g. MySQL has a packet
    # size that prevents chunks that are too large from being inserted.
    chunk_size_limit = 1000000
    bounded_sizes = []
    for size in chunk_sizes:
        if size < chunk_size_limit:
            bounded_sizes.append(size)
            continue
        remaining = size
        while remaining > 0:
            bounded_sizes.append(min(remaining, chunk_size_limit))
            remaining -= chunk_size_limit

    print("%d chunks" % (len(bounded_sizes),))

    # Append chunks in order; each call's return value is handed back as
    # the previous-chunk argument of the next call.
    last_chunk = None
    offset = 0
    for size in bounded_sizes:
        started = time.time()
        stop = offset + size
        last_chunk = indexed._append_to_fragment(
            last_chunk, offset, str(self.__rec.seq[offset:stop])
        )
        offset = stop
        print("add chunk to fragment: %.4f\r" % (time.time() - started,), end="")

    return indexed