Exemple #1
0
def censor_record(
    record,
    record_id="censored",
    label_generator="feature_%d",
    keep_topology=False,
    anonymise_features=True,
    preserve_sites=None,
):
    """Build a copy of ``record`` with a random sequence and censored features.

    Handy for producing example files, or for stripping confidential
    sequences out of a record before attaching it to a bug report.


    Parameters
    ----------

    record
      The record to be anonymized. It is not modified.

    record_id
      ID given to the new record (only used when ``anonymise_features`` is true).

    label_generator
      Recipe to change feature labels. Either ``"feature_%d"`` or ``None`` (no
      label) or a function (i, feature)=>label. Only used when
      ``anonymise_features`` is true.

    keep_topology
      When true, the ``topology`` annotation of ``record`` is copied onto the
      result, if the input has one.

    anonymise_features
      Whether to replace feature labels and ID/name, or not. When false, the
      result starts from a deep copy of ``record``.

    preserve_sites
      List of enzyme sites to keep. Example: ``["BsmBI", "BsaI"]``. The original
      sequence around the cut sites of these enzymes is carried over, and any
      site of these enzymes that the random sequence happens to contain is
      overwritten first.

    Returns
    -------

    The new record, as produced by ``record_with_different_sequence``.
    """

    def clamped_window(cut, flank, seq_length):
        """Return (start, end) of ``cut`` +/- ``flank``, kept inside the sequence."""
        return max(cut - flank, 0), min(cut + flank, seq_length)

    # Step 1: pick the template record (anonymised, or a verbatim copy).
    if anonymise_features:
        new_record = anonymized_record(
            record, record_id=record_id, label_generator=label_generator
        )
    else:
        new_record = deepcopy(record)

    # Step 2: optionally carry the topology annotation over.
    if keep_topology:
        try:
            new_record.annotations["topology"] = record.annotations["topology"]
        except KeyError:
            # The input record has no topology annotation; nothing to copy.
            pass

    # Step 3: draw a random sequence of the same length as the template.
    new_seq = random_dna_sequence(
        len(new_record), gc_share=None, probas=None, seed=None
    )

    # Without sites to preserve, the random sequence is used as-is.
    if not preserve_sites:
        return record_with_different_sequence(new_record, new_seq)

    restriction_batch = Restriction.RestrictionBatch(preserve_sites)

    # Step 4a: wipe out sites that appeared by chance in the random sequence.
    # Overwriting 10 bp on each side of the hit with "A" destroys the site
    # whichever strand it is on.
    chance_hits = Restriction.Analysis(
        restriction_batch, sequence=Seq(new_seq)
    ).full()
    for positions in chance_hits.values():
        for hit in positions:
            start, end = clamped_window(hit, 10, len(new_seq))
            # Filler is built per side so its length matches the clamped window.
            filler = "A" * (hit - start) + "A" * (end - hit)
            new_seq = easy_dna.replace_segment(new_seq, start, end, filler)

    # Step 4b: copy back the original sequence around each original hit.
    # 12 bp on each side of the cut site is kept, to capture the enzyme site.
    original_hits = Restriction.Analysis(
        restriction_batch, sequence=record.seq
    ).full()
    original_seq = str(record.seq)
    for positions in original_hits.values():
        for hit in positions:
            start, end = clamped_window(hit, 12, len(new_seq))
            new_seq = easy_dna.replace_segment(
                new_seq, start, end, original_seq[start:end]
            )

    return record_with_different_sequence(new_record, new_seq)
Exemple #2
0
 def createAnalysis(self, seq_str, batch_ary):
     """Helper that builds a ``Restriction.Analysis`` for a sequence string.

     ``batch_ary`` is handed to ``Restriction.RestrictionBatch`` and
     ``seq_str`` is wrapped in ``Seq``; the resulting analysis object is
     returned without running any search.
     """
     return Restriction.Analysis(
         Restriction.RestrictionBatch(batch_ary),
         Seq(seq_str),
     )