Example #1
0
def construct_design(doc, root, target_design):
    """Annotate ``root`` with one SequenceAnnotation per part in ``target_design``.

    Args:
        doc: SBOL document; ``doc.components`` is indexed by part URI, and
            ``doc.annotations`` / ``doc.sequences`` are only measured to seed
            the numeric suffixes of new URIs.
        root: Component that receives the annotations. Its ``uri`` is the
            prefix of every new annotation URI.
        target_design: Iterable of part URIs, in upstream-to-downstream order.

    Returns:
        ``root``, after the annotations have been chained together and
        ``assemble_subcomponents(root)`` has been called. When
        ``target_design`` is empty, ``root`` is returned untouched.
    """
    n_annotations = len(doc.annotations)
    n_sequences = len(doc.sequences)

    sbol_parts = []
    for uri in target_design:
        part = doc.components[uri]
        n_annotations += 1
        SA = sbol.SequenceAnnotation(
            doc, '%s/SA_%d' % (root.uri, n_annotations))
        if not part.sequence:
            # Parts without a sequence get a one-base placeholder so that the
            # annotation below has a non-zero length.
            part.sequence = sbol.DNASequence(
                doc, '%s/Seq_%d' % (part.uri, n_sequences + 1))
            part.sequence.nucleotides = 'n'
        SA.start = 1
        SA.end = len(part.sequence.nucleotides)
        # NOTE(review): confirm `orientation` is the attribute the SBOL
        # bindings actually read; it is kept exactly as the original wrote it.
        SA.orientation = '+'
        SA.subcomponent = part
        sbol_parts.append(SA)

    if not sbol_parts:
        # Nothing to assemble; previously this case failed with an IndexError
        # when reading sbol_parts[0].
        return root

    # The first annotation is attached directly; every later one is inserted
    # downstream of its predecessor.
    root.annotations.append(sbol_parts[0])
    for upstream_ann, downstream_ann in zip(sbol_parts, sbol_parts[1:]):
        insert_annotation_downstream(root, upstream_ann, downstream_ann)

    assemble_subcomponents(root)
    return root
Example #2
0
 def testAnnotations(self):
     # Repeatedly add a freshly created SequenceAnnotation to the first testee
     # and check that the collection's length and membership follow along.
     testee = self.testees[0]
     for expected_count in range(NUM_SLOW_TESTS):
         # Before each addition the collection must hold one annotation per
         # completed iteration.
         self.assertEqual(len(testee.annotations), expected_count)

         new_uri = random_uri()
         self.uris.append(new_uri)
         annotation = sbol.SequenceAnnotation(self.doc, new_uri)

         self.assertNotIn(annotation, testee.annotations)
         testee.annotations += annotation
         self.assertIn(annotation, testee.annotations)
Example #3
0
 def makeAnnot(f, parent=dc):
     """Create a DNAComponent plus SequenceAnnotation for feature ``f``.

     The annotation is appended to ``parent.annotations`` and the shared
     counter ``fid[0]`` is advanced by one.

     When ``self.shape == 'c'``, ``f.end > l`` and ``parent`` is ``dc``, the
     feature is emitted as two pieces instead: first ``[f.start, l]``, then
     ``[0, end - l]``. In that case ``f.start`` / ``f.end`` are left modified
     and nothing is returned. (Presumably 'c' marks a circular sequence and
     ``l`` is its length -- confirm against the enclosing scope.)

     Returns:
         The new DNAComponent, or ``None`` for the split case.
     """
     needs_split = self.shape == 'c' and f.end > l and parent == dc
     if needs_split:
         full_end = f.end
         # First piece: from the feature start up to l.
         f.end = l
         makeAnnot(f, parent)
         # Second piece: the remainder, restarted at position 0.
         f.end = full_end - l
         f.start = 0
         makeAnnot(f, parent)
         return

     serial = str(fid[0])

     dcf = sbol.DNAComponent(doc, "#dc_" + serial)
     dcf.display_id = str(f.type)
     qualifier_pairs = ["%s:%s" % (q.name, q.data) for q in f.qualifiers.all()]
     dcf.description = str(";".join(qualifier_pairs))

     sa = sbol.SequenceAnnotation(doc, "#sa_" + serial)
     sa.subcomponent = dcf
     sa.strand = '+' if f.direction == 'f' else '-'
     # Both coordinates are shifted by one (SBOL is 1-based).
     sa.start = f.start + 1
     sa.end = f.end + 1
     parent.annotations.append(sa)

     fid[0] += 1
     return dcf
Example #4
0
def qc(design, data=None, infile=None):
    """Align sequencing data against ``design`` and annotate the outcome.

    The design sequence and the clone sequence are aligned. Then, for every
    annotation reported by ``getSequenceAnnotationsAtBaseNo``, the covered
    bases are classified with ``verify_base`` and grouped into runs that share
    a classification. Each run with a truthy classification becomes a new
    SequenceAnnotation appended to ``ann.subcomponent.annotations``.

    Args:
        design: Component under test. Reads ``uri``, ``display_id``, ``doc``
            and ``sequence.nucleotides``.
        data: FASTA text with one or more reads. When it holds more than one
            record the reads are aligned and reduced to a consensus first.
        infile: Optional path; when given, its contents replace ``data``.

    Returns:
        None. The function works by side effect (prints progress and adds
        annotations). It does nothing at all when there is no data.
    """
    if infile:
        with open(infile, "r") as f:
            data = f.read()
    if not data:
        return

    if len(parse_fasta(data)) > 1:
        clone = find_consensus(align(data))
    else:
        clone = data
    target_design = write_to_fasta([(design.uri,
                                     design.sequence.nucleotides)])
    alignment_qc = align(target_design + '\r\n' + clone,
                         outfile='%s.align' % design.display_id)

    # Record 0 is the reference (design), record 1 the query (clone).
    aligned_records = parse_fasta(alignment_qc)
    reference_seq = aligned_records[0][1]
    query_seq = aligned_records[1][1]
    assert len(reference_seq) == len(query_seq)

    ref_map = _map_reference_to_alignment(reference_seq, query_seq)

    for ann in _collect_leaf_annotations(design):
        # Covered reference coordinates that fall inside this annotation.
        # Sorted so that runs below are built in sequence order regardless of
        # dict iteration order.
        covered_coordinates = sorted(
            x for x in ref_map if ann.start <= x <= ann.end)
        if not covered_coordinates:
            continue
        alignment_coordinates = [ref_map[x] for x in covered_coordinates]
        alignment_start = min(alignment_coordinates)
        alignment_end = max(alignment_coordinates)

        print("Verifying %s from %d to %d" %
              (ann.subcomponent.display_id, ann.start, ann.end))
        # alignment_end is inclusive, hence the + 1 on the slice bound.
        print(reference_seq[alignment_start:alignment_end + 1])
        print(query_seq[alignment_start:alignment_end + 1])

        base_comparisons = [
            verify_base(reference_seq[x], query_seq[x])
            for x in alignment_coordinates
        ]
        # Report columns that verify_base could not classify.
        for x, comparison in zip(alignment_coordinates, base_comparisons):
            if comparison is None:
                print(x, reference_seq[x], query_seq[x])

        regions = _group_regions(covered_coordinates, base_comparisons)
        for i_region, region in enumerate(regions):
            print(i_region)
            _annotate_region(design, ann, region)


def _map_reference_to_alignment(reference_seq, query_seq):
    """Map 1-based reference coordinates to covered alignment columns.

    Leading and trailing '-' padding in ``query_seq`` marks columns that the
    sequencing data does not cover, e.g.::

        ref actggtca
        qry --tggt--

    Only columns inside the half-open window ``[i_left, i_right)`` are mapped.
    An all-gap query yields an empty map.
    """
    i_left = len(query_seq) - len(query_seq.lstrip('-'))
    i_right = len(query_seq.rstrip('-'))  # exclusive bound

    ref_map = {}
    i_ref = 0
    for i_alignment, ref_base in enumerate(reference_seq):
        if ref_base != '-':
            i_ref += 1
        # NOTE(review): a gap column in the reference does not advance i_ref,
        # so the current reference coordinate is re-pointed at that column.
        # Kept as in the original; confirm this is the intended way to surface
        # insertions.
        if i_left <= i_alignment < i_right:
            ref_map[i_ref] = i_alignment
    return ref_map


def _collect_leaf_annotations(design):
    """Return the distinct annotations found at any base of ``design``."""
    leaf_annotations = []
    for i_design in range(len(design.sequence.nucleotides)):
        for ann in getSequenceAnnotationsAtBaseNo(design, i_design):
            if ann not in leaf_annotations:
                leaf_annotations.append(ann)
    return leaf_annotations


def _group_regions(coordinates, classifications):
    """Group consecutive equal classifications into ((start, end), term) runs.

    ``coordinates[i]`` is the reference coordinate whose classification is
    ``classifications[i]``. ``start`` and ``end`` are both inclusive. Every
    base belongs to exactly one run, including a lone first or last base.
    """
    regions = []
    n_bases = len(classifications)
    run_start = 0
    for i in range(1, n_bases + 1):
        run_term = classifications[run_start]
        if i == n_bases or not classifications[i] == run_term:
            regions.append(
                ((coordinates[run_start], coordinates[i - 1]), run_term))
            run_start = i
    return regions


def _annotate_region(design, ann, region):
    """Attach one classified region to ``ann.subcomponent`` as an annotation.

    Regions whose classification is falsy are skipped. Matches are filed under
    ``<design.uri>/MatchedSequence``, everything else under
    ``<design.uri>/AssemblyErrors``.
    """
    (qc_start, qc_end), qc_classification = region
    if not qc_classification:
        return

    doc = design.doc
    n_components = len(doc.components)
    n_annotations = len(doc.annotations)
    if qc_classification == SO_NUCLEOTIDE_MATCH:
        category = "MatchedSequence"
    else:
        category = "AssemblyErrors"

    sa_uri = "%s/%s/SA%d" % (design.uri, category, n_annotations)
    annotated_region = sbol.SequenceAnnotation(doc, sa_uri)
    annotated_region.start = qc_start
    annotated_region.end = qc_end
    annotated_region.subcomponent = sbol.DNAComponent(
        doc, "%s/DC%d" % (sa_uri, n_components))
    annotated_region.subcomponent.display_id = ""
    annotated_region.subcomponent.type = qc_classification

    print("Adding %s to %s from %d to %d" %
          (annotated_region.uri, ann.subcomponent.display_id,
           annotated_region.start, annotated_region.end))
    ann.subcomponent.annotations.append(annotated_region)
Example #5
0
 def createTestees(self):
     """Register a new random URI and add a SequenceAnnotation built from it to the testees."""
     new_uri = random_uri()
     self.uris.append(new_uri)
     annotation = sbol.SequenceAnnotation(self.doc, new_uri)
     self.testees.append(annotation)