def testSubcomponent(self):
    """Check that ``subcomponent`` starts unset and round-trips an assignment.

    The first testee must report ``None`` for its subcomponent; after a
    freshly created ``sbol.DNAComponent`` is assigned, reading the attribute
    back must yield that same component.
    """
    testee = self.testees[0]
    # assertEqual replaces the deprecated assertEquals alias, which newer
    # unittest releases no longer provide.
    self.assertEqual(testee.subcomponent, None)

    uri = random_uri()
    # Record the URI on the test case alongside the other generated URIs.
    self.uris.append(uri)
    com = sbol.DNAComponent(self.doc, uri)

    testee.subcomponent = com
    self.assertEqual(testee.subcomponent, com)
def initialize_design(doc):
    """Create, register and return the next numbered design root in *doc*.

    The new component is numbered one higher than the count of components in
    ``doc.components`` whose ``type`` equals ``DESIGN``. It is given a
    matching display id and name, plus a placeholder sequence containing the
    single nucleotide ``'n'``.
    """
    existing_designs = 0
    for part in doc.components:
        if part.type and part.type == DESIGN:
            existing_designs += 1

    design_no = existing_designs + 1
    label = 'Design %d' % design_no

    root = sbol.DNAComponent(doc, '%s/Design_%d' % (BASE_URI, design_no))
    root.type = DESIGN
    root.display_id = label
    root.name = label

    sequence_uri = '%s/Design_%d/Seq_%d' % (BASE_URI, design_no, design_no)
    root.sequence = sbol.DNASequence(doc, sequence_uri)
    root.sequence.nucleotides = 'n'
    return root
def testComponents(self):
    """Add components to the first testee one by one and check membership.

    On each of ``NUM_SLOW_TESTS`` rounds the ``components`` container must
    hold exactly the components added so far, must not yet contain the new
    component, and must contain it (and grow by one) after ``+=``.
    """
    collection = self.testees[0]
    for already_added in range(NUM_SLOW_TESTS):
        self.assertEqual(len(collection.components), already_added)

        new_uri = random_uri()
        self.uris.append(new_uri)
        component = sbol.DNAComponent(self.doc, new_uri)

        self.assertFalse(component in collection.components)
        collection.components += component
        self.assertTrue(component in collection.components)
        self.assertEqual(len(collection.components), already_added + 1)
def sbol(self):
    """Build an SBOL document describing this object and return it as a string.

    A root ``DNAComponent`` carries the object's name, description and
    sequence. Features are then attached as ``SequenceAnnotation`` objects:

    * features whose ``gene`` lookup raises are annotated directly on the
      root component;
    * features of type ``"fragment"`` with a gene are annotated on the root,
      and the remaining features of the same gene are annotated on that
      fragment's component;
    * features whose ``gene`` is ``None``, and gene features without a
      matching ``"fragment"`` feature, are not emitted.

    Returns:
        ``str(doc)`` for the populated ``sbol.Document``.
    """
    doc = sbol.Document()
    dc = sbol.DNAComponent(doc, "#" + str(self.pk) + "_con")
    dc.display_id = str(self.name)
    dc.description = str(self.description)
    dc.sequence = sbol.DNASequence(doc, "#" + str(self.pk) + "_seq")
    dc.sequence.nucleotides = str(self.sequence())

    # One-element list so the nested helper can advance the shared counter.
    fid = [1]
    total_length = self.length()

    def makeAnnot(f, parent=dc):
        """Annotate feature *f* on *parent*; return the new sub-component.

        When ``self.shape == 'c'`` and a root-level feature ends past the
        sequence length, it is emitted as two annotations (up to the end,
        then from position 0) and ``None`` is returned. ``f.start`` and
        ``f.end`` are left modified in that case.
        """
        if self.shape == 'c' and f.end > total_length and parent == dc:
            end = f.end
            f.end = total_length
            makeAnnot(f, parent)
            f.end = end - total_length
            f.start = 0
            makeAnnot(f, parent)
            return
        dcf = sbol.DNAComponent(doc, "#dc_" + str(fid[0]))
        dcf.display_id = str(f.type)
        dcf.description = str(";".join(
            ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]))
        sa = sbol.SequenceAnnotation(doc, "#sa_" + str(fid[0]))
        sa.subcomponent = dcf
        if f.direction == 'f':
            sa.strand = '+'
        else:
            sa.strand = '-'
        sa.start = f.start + 1  # SBOL 1-based
        sa.end = f.end + 1
        parent.annotations.append(sa)
        fid[0] += 1
        return dcf

    fragments = {}
    fragmentfeats = {}
    for f in self.features(False):  # sub-annotations relative to parent
        try:
            gene = f.gene
            if gene is not None:
                if f.type == "fragment":
                    fragments[gene] = f
                else:
                    fragmentfeats.setdefault(gene, []).append(f)
        except Exception:
            # The gene lookup (or grouping) failed: treat the feature as a
            # plain root-level annotation. Narrowed from a bare ``except`` so
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            makeAnnot(f)

    # ``items()`` instead of ``iteritems()``: the latter does not exist on
    # Python 3, while ``items()`` works on both major versions.
    for gene, fragment in fragments.items():
        # NOTE(review): makeAnnot returns None for a fragment that wraps the
        # origin of a circular sequence, so the next line would raise
        # AttributeError in that case -- confirm whether that can occur.
        dcf = makeAnnot(fragment)
        dcf.description = str(gene.description)
        if gene in fragmentfeats:
            for f in fragmentfeats[gene]:
                makeAnnot(f, dcf)

    return str(doc)
def makeAnnot(f, parent=dc):
    """Attach feature *f* to *parent* as an SBOL annotation.

    Relies on ``self``, ``doc``, ``dc``, ``l`` and the one-element counter
    ``fid`` from the enclosing scope. Returns the ``DNAComponent`` created
    for the feature, or ``None`` when the feature was split in two because it
    runs past the end of a circular (``shape == 'c'``) sequence; in that case
    ``f.start`` and ``f.end`` are left modified.
    """
    crosses_origin = self.shape == 'c' and f.end > l and parent == dc
    if crosses_origin:
        original_end = f.end
        # First piece: from the feature start up to the sequence end.
        f.end = l
        makeAnnot(f, parent)
        # Second piece: the overhang, re-based at position 0.
        f.end = original_end - l
        f.start = 0
        makeAnnot(f, parent)
        return

    serial = str(fid[0])
    feature_component = sbol.DNAComponent(doc, "#dc_" + serial)
    feature_component.display_id = str(f.type)
    qualifier_pairs = ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]
    feature_component.description = str(";".join(qualifier_pairs))

    annotation = sbol.SequenceAnnotation(doc, "#sa_" + serial)
    annotation.subcomponent = feature_component
    annotation.strand = '+' if f.direction == 'f' else '-'
    annotation.start = f.start + 1  # SBOL 1-based
    annotation.end = f.end + 1
    parent.annotations.append(annotation)

    fid[0] += 1
    return feature_component
def _covered_bounds(query_seq):
    """Return (first, last) alignment indices of the non-padding part of *query_seq*.

    Leading and trailing ``'-'`` runs are treated as padding, e.g.::

        ref  actggtca
        qry  --tggt--   ->  (2, 5)

    For a query consisting only of ``'-'`` the result is ``(len, -1)``, an
    empty range.
    """
    first = len(query_seq) - len(query_seq.lstrip('-'))
    last = len(query_seq.rstrip('-')) - 1
    return first, last


def _group_regions(base_comparisons, covered_coordinates):
    """Collapse per-base classifications into contiguous runs.

    *base_comparisons* and *covered_coordinates* are parallel lists. Returns
    ``((ref_start, ref_end), classification)`` tuples, one per maximal run of
    equal consecutive classifications, with inclusive reference coordinates.
    Single-base runs and a run ending on the final base are both reported.
    """
    regions = []
    total = len(base_comparisons)
    run_start = 0
    for i in range(1, total + 1):
        run_ended = i == total or not (
            base_comparisons[i] == base_comparisons[i - 1])
        if run_ended:
            span = (covered_coordinates[run_start], covered_coordinates[i - 1])
            regions.append((span, base_comparisons[run_start]))
            run_start = i
    return regions


def qc(design, data=None, infile=None):
    """Compare sequencing data against *design* and annotate the outcome.

    The sequencing data (FASTA text passed as *data* or read from *infile*;
    *infile* wins when both are given) is aligned against the design
    sequence. When it holds more than one record the records are aligned and
    reduced with ``find_consensus`` first. For every annotation returned by
    ``getSequenceAnnotationsAtBaseNo`` the covered bases are classified with
    ``verify_base``, grouped into contiguous regions, and each region with a
    truthy classification is appended to ``ann.subcomponent.annotations`` as
    a new ``SequenceAnnotation`` under ``.../MatchedSequence/...`` (when the
    classification equals ``SO_NUCLEOTIDE_MATCH``) or
    ``.../AssemblyErrors/...`` (otherwise).

    Args:
        design: component with ``uri``, ``display_id``, ``doc`` and
            ``sequence.nucleotides``.
        data: FASTA text of the sequencing read(s).
        infile: path of a file to read *data* from.

    Returns:
        None. The design's document is modified in place and progress is
        printed to stdout.

    Raises:
        ValueError: if neither *data* nor *infile* yields any data.
    """
    if infile:
        with open(infile, "r") as f:
            data = f.read()
    if not data:
        # Previously this path fell through to an unbound ``clone``.
        raise ValueError("qc() requires sequencing data via 'data' or 'infile'")

    if len(parse_fasta(data)) > 1:
        multialignment = align(data)
        clone = find_consensus(multialignment)
    else:
        clone = data

    target_design = write_to_fasta(
        [(design.uri, design.sequence.nucleotides)])
    alignment_qc = align(target_design + '\r\n' + clone,
                         outfile='%s.align' % design.display_id)

    # Scan alignment and classify mutations
    design_seq = design.sequence.nucleotides
    aligned_records = parse_fasta(alignment_qc)
    reference_seq = aligned_records[0][1][:]
    query_seq = aligned_records[1][1][:]
    assert len(reference_seq) == len(query_seq)

    # If the design sequence is not fully covered by sequencing data there
    # may be '---' padding at either end of the query sequence; only columns
    # between i_left and i_right (inclusive) count as covered.
    i_left, i_right = _covered_bounds(query_seq)

    # Maps 1-based nucleotide coordinates of the reference sequence to
    # alignment columns. Uncovered reference bases are left out.
    ref_map = {}
    i_ref = 0
    for i_alignment, ref_base in enumerate(reference_seq):
        if ref_base == '-':
            continue
        i_ref += 1
        if i_left <= i_alignment <= i_right:
            ref_map[i_ref] = i_alignment

    # Only leaf annotations at the bottom of the hierarchy are annotated...
    leaf_annotations = []
    for i_design in range(len(design_seq)):
        for ann in getSequenceAnnotationsAtBaseNo(design, i_design):
            if ann not in leaf_annotations:
                leaf_annotations.append(ann)

    doc = design.doc

    # Slice the alignment into segments that pertain to each annotation, then
    # determine the covered bases in the annotation. All, part, or several
    # discontiguous parts of an annotation may be covered.
    for ann in leaf_annotations:
        # Covered reference coordinates that fall inside this annotation,
        # sorted so that regions are built in sequence order.
        covered_coordinates = sorted(
            x for x in ref_map if ann.start <= x <= ann.end)
        # Now translate into alignment coordinates
        alignment_coordinates = [ref_map[x] for x in covered_coordinates]
        if not alignment_coordinates:
            continue

        alignment_start = min(alignment_coordinates)
        alignment_end = max(alignment_coordinates)

        # Show the aligned segment; alignment_end is inclusive.
        print("Verifying %s from %d to %d" %
              (ann.subcomponent.display_id, ann.start, ann.end))
        print(''.join(reference_seq[alignment_start:alignment_end + 1]))
        print(''.join(query_seq[alignment_start:alignment_end + 1]))

        # Classification of alignment
        base_comparisons = [
            verify_base(reference_seq[x], query_seq[x])
            for x in alignment_coordinates
        ]
        for x, comparison in zip(alignment_coordinates, base_comparisons):
            if comparison is None:
                print(x, reference_seq[x], query_seq[x])

        # TODO: add unit tests for region demarcation (first/last region,
        # two distinct regions, single-base regions at either end).
        regions = _group_regions(base_comparisons, covered_coordinates)

        # Create SequenceAnnotations for QC'd regions
        for i_region, region in enumerate(regions):
            print(i_region)
            (qc_start, qc_end), qc_classification = region
            if not qc_classification:
                continue

            n_components = len(doc.components)
            n_annotations = len(doc.annotations)
            if qc_classification == SO_NUCLEOTIDE_MATCH:
                # The reference sequence matches the query sequence
                category = "MatchedSequence"
            else:
                # A mismatch was identified
                category = "AssemblyErrors"

            annotated_region = sbol.SequenceAnnotation(
                doc, "%s/%s/SA%d" % (design.uri, category, n_annotations))
            annotated_region.start = qc_start
            annotated_region.end = qc_end
            annotated_region.subcomponent = sbol.DNAComponent(
                doc, "%s/%s/SA%d/DC%d" %
                (design.uri, category, n_annotations, n_components))
            annotated_region.subcomponent.display_id = ""
            annotated_region.subcomponent.type = qc_classification

            print("Adding %s to %s from %d to %d" %
                  (annotated_region.uri, ann.subcomponent.display_id,
                   annotated_region.start, annotated_region.end))
            ann.subcomponent.annotations.append(annotated_region)
def createTestees(self):
    """Create one ``sbol.DNAComponent`` under a random URI and register it.

    The URI is recorded in ``self.uris`` and the component is appended to
    ``self.testees``.
    """
    fresh_uri = random_uri()
    self.uris.append(fresh_uri)
    component = sbol.DNAComponent(self.doc, fresh_uri)
    self.testees.append(component)