Beispiel #1
0
 def testSubcomponent(self):
     """Check that the first testee's ``subcomponent`` starts unset and round-trips.

     The attribute is expected to be None initially; after assigning a newly
     created DNAComponent, reading it back must compare equal to that
     component.
     """
     # assertEquals is a deprecated alias (removed in Python 3.12); use the
     # canonical assertion names instead.
     self.assertIsNone(self.testees[0].subcomponent)
     uri = random_uri()
     # Record the URI before constructing the component, as the original did.
     self.uris.append(uri)
     com = sbol.DNAComponent(self.doc, uri)
     self.testees[0].subcomponent = com
     self.assertEqual(self.testees[0].subcomponent, com)
Beispiel #2
0
def initialize_design(doc):
    """Create the next numbered root design component in ``doc``.

    Counts the components in ``doc.components`` whose ``type`` equals
    ``DESIGN`` and builds a new ``sbol.DNAComponent`` numbered one higher.
    The component gets a matching display id and name, plus a new
    ``sbol.DNASequence`` whose nucleotides are set to the placeholder ``'n'``.

    Returns the newly created root component.
    """
    existing_designs = sum(
        1 for part in doc.components if part.type and part.type == DESIGN)
    design_number = existing_designs + 1
    design_uri = '%s/Design_%d' % (BASE_URI, design_number)
    label = 'Design %d' % design_number

    root = sbol.DNAComponent(doc, design_uri)
    root.type = DESIGN
    root.display_id = label
    root.name = label
    # The sequence URI is nested under the design URI and reuses its number.
    root.sequence = sbol.DNASequence(
        doc, '%s/Seq_%d' % (design_uri, design_number))
    root.sequence.nucleotides = 'n'
    return root
Beispiel #3
0
 def testComponents(self):
     """Add components to the first testee one by one, checking size and membership.

     For each of ``NUM_SLOW_TESTS`` iterations a new DNAComponent is created,
     verified to be absent from ``col.components``, added with ``+=``, and
     then verified to be present, with the collection length growing by one.
     """
     col = self.testees[0]
     for n in range(NUM_SLOW_TESTS):
         self.assertEqual(len(col.components), n)
         uri = random_uri()
         self.uris.append(uri)
         com = sbol.DNAComponent(self.doc, uri)
         # assertIn/assertNotIn report both operands on failure, unlike
         # assertTrue/assertFalse around an ``in`` expression.
         self.assertNotIn(com, col.components)
         col.components += com
         self.assertIn(com, col.components)
         self.assertEqual(len(col.components), n + 1)
Beispiel #4
0
 def sbol(self):
     """Build an SBOL document for this object and return it as a string.

     A root ``sbol.DNAComponent`` is created from ``self.pk``, ``self.name``,
     ``self.description`` and ``self.sequence()``.  Every feature returned by
     ``self.features(False)`` is then turned into a SequenceAnnotation with
     its own DNAComponent:

     * features whose ``gene`` lookup raises are annotated directly on the
       root component;
     * features of type ``"fragment"`` with a gene are annotated on the root
       component, and the remaining features of the same gene are annotated
       on that fragment's component.

     Returns:
         ``str(doc)`` for the populated document.
     """
     doc = sbol.Document()
     dc = sbol.DNAComponent(doc, "#" + str(self.pk) + "_con")
     dc.display_id = str(self.name)
     dc.description = str(self.description)
     dc.sequence = sbol.DNASequence(doc, "#" + str(self.pk) + "_seq")
     dc.sequence.nucleotides = str(self.sequence())
     # One-element list so the nested function can advance the id counter.
     fid = [1]
     seq_length = self.length()

     def makeAnnot(f, parent=dc, start=None, end=None):
         """Annotate feature ``f`` on ``parent`` and return its new component.

         ``start``/``end`` override ``f.start``/``f.end`` so a feature can be
         split without modifying the feature object itself.  Returns None
         when the feature was split across the origin.
         """
         start = f.start if start is None else start
         end = f.end if end is None else end
         if self.shape == 'c' and end > seq_length and parent == dc:
             # Feature runs past the end of a circular sequence: emit one
             # annotation up to the end and one restarting at position 0.
             makeAnnot(f, parent, start, seq_length)
             makeAnnot(f, parent, 0, end - seq_length)
             # NOTE(review): no component is returned for a split feature, so
             # a wrap-around *fragment* makes the caller's
             # ``dcf.description = ...`` fail on None -- confirm desired
             # handling.
             return
         dcf = sbol.DNAComponent(doc, "#dc_" + str(fid[0]))
         dcf.display_id = str(f.type)
         dcf.description = str(";".join(
             ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]))
         sa = sbol.SequenceAnnotation(doc, "#sa_" + str(fid[0]))
         sa.subcomponent = dcf
         if f.direction == 'f':
             sa.strand = '+'
         else:
             sa.strand = '-'
         sa.start = start + 1  # SBOL 1-based
         # NOTE(review): if ``end`` is an exclusive 0-based bound (the
         # ``end > seq_length`` wrap test suggests it may be), ``end + 1``
         # overshoots by one -- confirm the feature coordinate convention.
         sa.end = end + 1
         parent.annotations.append(sa)
         fid[0] += 1
         return dcf

     fragments = {}
     fragmentfeats = {}
     for f in self.features(False):  # sub-annotations relative to parent
         try:
             gene = f.gene
             # NOTE(review): a feature whose gene is None is neither grouped
             # nor annotated -- confirm that skipping it is intended.
             if gene is not None:
                 if f.type == "fragment":
                     fragments[gene] = f
                 else:
                     fragmentfeats.setdefault(gene, []).append(f)
         except Exception:
             # Narrowed from a bare ``except:`` so KeyboardInterrupt and
             # SystemExit are no longer swallowed; features whose gene lookup
             # fails are still annotated directly on the root component.
             makeAnnot(f)
     # ``items()`` works on both Python 2 and 3 (``iteritems`` is Python 2 only).
     for gene, fragment in fragments.items():
         dcf = makeAnnot(fragment)
         dcf.description = str(gene.description)
         if gene in fragmentfeats:
             for f in fragmentfeats[gene]:
                 makeAnnot(f, dcf)
     return str(doc)
Beispiel #5
0
 def makeAnnot(f, parent=dc, start=None, end=None):
     """Annotate feature ``f`` on ``parent`` and return its new DNAComponent.

     Relies on ``self``, ``l``, ``dc``, ``doc`` and ``fid`` from the enclosing
     scope.  ``start``/``end`` default to ``f.start``/``f.end``; they exist so
     a feature that runs past the end of a circular sequence can be split
     into two annotations without overwriting the feature's own coordinates
     (the original mutated ``f.start``/``f.end`` and never restored them).

     Returns the created component, or None when the feature was split.
     """
     start = f.start if start is None else start
     end = f.end if end is None else end
     if self.shape == 'c' and end > l and parent == dc:
         # First piece runs to the end of the sequence, second restarts at 0.
         makeAnnot(f, parent, start, l)
         makeAnnot(f, parent, 0, end - l)
         # NOTE(review): callers that use the return value get None here.
         return
     dcf = sbol.DNAComponent(doc, "#dc_" + str(fid[0]))
     dcf.display_id = str(f.type)
     dcf.description = str(";".join(
         ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]))
     sa = sbol.SequenceAnnotation(doc, "#sa_" + str(fid[0]))
     sa.subcomponent = dcf
     if f.direction == 'f':
         sa.strand = '+'
     else:
         sa.strand = '-'
     sa.start = start + 1  # SBOL 1-based
     sa.end = end + 1
     parent.annotations.append(sa)
     # ``fid`` is a one-element list used as a mutable counter.
     fid[0] += 1
     return dcf
Beispiel #6
0
def _classified_regions(base_comparisons, covered_coordinates):
    """Collapse per-base classifications into contiguous runs.

    ``base_comparisons[i]`` is the classification of the base whose reference
    coordinate is ``covered_coordinates[i]``.  Returns a list of
    ``((ref_start, ref_end), classification)`` tuples, one per run of equal
    consecutive classifications, in order.
    """
    from itertools import groupby

    regions = []
    for term, run in groupby(enumerate(base_comparisons),
                             key=lambda pair: pair[1]):
        indices = [index for index, _ in run]
        regions.append(((covered_coordinates[indices[0]],
                         covered_coordinates[indices[-1]]), term))
    return regions


def qc(design, data=None, infile=None):
    """Align sequencing data to a design and annotate the classified regions.

    The sequencing data (FASTA text) is taken from ``infile`` when given,
    otherwise from ``data``.  When it holds more than one record the records
    are aligned and reduced to a consensus first.  The result is aligned
    against ``design.sequence.nucleotides``; each base covered by the query
    is classified with ``verify_base``; runs of equal classification inside
    each annotation found by ``getSequenceAnnotationsAtBaseNo`` are added as
    new SequenceAnnotations to that annotation's subcomponent.

    Args:
        design: component providing ``uri``, ``display_id``, ``doc`` and
            ``sequence.nucleotides``.
        data: FASTA text of the sequencing read(s); ignored if ``infile`` is
            given.
        infile: path of a file to read the FASTA text from.

    Returns:
        None.  Does nothing when no data is supplied.

    Fixes relative to the previous implementation:
        * the alignment column just past the last covered query base is no
          longer treated as covered;
        * a query consisting only of gaps yields no coverage instead of
          raising StopIteration;
        * the final run of classifications is recorded with its own
          classification (previously the preceding run was dropped and the
          last base was labelled with the preceding run's classification),
          and single-base annotations now produce a region;
        * regions without a classification are skipped instead of re-using
          (or failing on) the previous region's annotation.
    """
    if infile:
        with open(infile, "r") as f:
            data = f.read()
    if not data:
        return

    if len(parse_fasta(data)) > 1:
        multialignment = align(data)
        clone = find_consensus(multialignment)
    else:
        clone = data
    target_design = write_to_fasta([(design.uri,
                                     design.sequence.nucleotides)])
    alignment_qc = align(target_design + '\r\n' + clone,
                         outfile='%s.align' % design.display_id)

    # Scan alignment and classify mutations
    design_seq = design.sequence.nucleotides
    aligned_records = parse_fasta(alignment_qc)
    reference_seq = aligned_records[0][1][:]
    query_seq = aligned_records[1][1][:]
    assert len(reference_seq) == len(query_seq)

    # If the design sequence is not fully covered by sequencing data, the
    # query is padded with '-' at either end, e.g.
    #   ref actggtca
    #   qry --tggt--
    # [i_left, i_right) is the half-open span of alignment columns between
    # the first and last non-gap query bases.
    i_left = len(query_seq) - len(query_seq.lstrip('-'))
    i_right = len(query_seq.rstrip('-'))

    # Maps 1-based reference coordinates to alignment columns, for covered
    # columns only.
    # NOTE(review): a reference gap column overwrites the entry of the
    # preceding reference base, so that base ends up mapped to the gap
    # column -- confirm whether this is how insertions are meant to surface.
    ref_map = {}
    i_ref = 0
    for i_alignment, ref_base in enumerate(reference_seq):
        if ref_base != '-':
            i_ref += 1
        if i_left <= i_alignment < i_right:
            ref_map[i_ref] = i_alignment

    # Collect each annotation reported for any base of the design once,
    # preserving first-seen order.
    leaf_annotations = []
    for i_design in range(len(design_seq)):
        for ann in getSequenceAnnotationsAtBaseNo(design, i_design):
            if ann not in leaf_annotations:
                leaf_annotations.append(ann)

    # Sorted so run boundaries are computed over ascending coordinates
    # regardless of dict iteration order.
    all_covered = sorted(ref_map)

    # Slice the alignment into the segment pertaining to each annotation.
    # All, part, or several discontiguous parts of an annotation may be
    # covered.
    for ann in leaf_annotations:
        covered_coordinates = [
            x for x in all_covered if ann.start <= x <= ann.end
        ]
        if not covered_coordinates:
            continue
        # Translate into alignment coordinates
        alignment_coordinates = [ref_map[x] for x in covered_coordinates]
        alignment_start = min(alignment_coordinates)
        alignment_end = max(alignment_coordinates)

        print("Verifying %s from %d to %d" %
              (ann.subcomponent.display_id, ann.start, ann.end))
        print(''.join(reference_seq[alignment_start:alignment_end]))
        print(''.join(query_seq[alignment_start:alignment_end]))

        # Classify every covered base; report the ones that could not be
        # classified.
        base_comparisons = [
            verify_base(reference_seq[x], query_seq[x])
            for x in alignment_coordinates
        ]
        for x, comparison in zip(alignment_coordinates, base_comparisons):
            if comparison is None:
                print(x, reference_seq[x], query_seq[x])

        regions = _classified_regions(base_comparisons, covered_coordinates)

        # Create SequenceAnnotations for QC'd regions
        doc = design.doc
        for i_region, ((qc_start, qc_end),
                       qc_classification) in enumerate(regions):
            print(i_region)
            if not qc_classification:
                # Nothing to annotate for an unclassified run.
                continue
            n_components = len(doc.components)
            n_annotations = len(doc.annotations)
            if qc_classification == SO_NUCLEOTIDE_MATCH:
                # The reference sequence matches the query sequence
                category = "MatchedSequence"
            else:
                # A mismatch was identified
                category = "AssemblyErrors"
            annotation_uri = "%s/%s/SA%d" % (design.uri, category,
                                             n_annotations)
            annotated_region = sbol.SequenceAnnotation(doc, annotation_uri)
            annotated_region.start = qc_start
            annotated_region.end = qc_end
            annotated_region.subcomponent = sbol.DNAComponent(
                doc, "%s/DC%d" % (annotation_uri, n_components))
            annotated_region.subcomponent.display_id = ""
            annotated_region.subcomponent.type = qc_classification
            print("Adding %s to %s from %d to %d" %
                  (annotated_region.uri, ann.subcomponent.display_id,
                   annotated_region.start, annotated_region.end))
            ann.subcomponent.annotations.append(annotated_region)
Beispiel #7
0
 def createTestees(self):
     """Append one new DNAComponent, built from a fresh random URI, to the testees.

     The URI is also recorded in ``self.uris`` before the component is
     created.
     """
     fresh_uri = random_uri()
     self.uris.append(fresh_uri)
     component = sbol.DNAComponent(self.doc, fresh_uri)
     self.testees.append(component)