Ejemplo n.º 1
0
def simulate_heterodimer_grammar(motif1, motif2, seq_length, min_spacing,
                                 max_spacing, num_pos, num_neg, GC_fraction):
  """
  Simulate a two-class sequence dataset built from motif1 and motif2.

  - Positive class: motif1 and motif2 are embedded as a pair whose
    separation is drawn from
    ``UniformIntegerGenerator(min_spacing, max_spacing)``.
  - Negative class: sequences returned by
    ``simulate_multi_motif_embedding`` for the two motif names, i.e.
    without the pair-spacing grammar.

  Parameters
  ----------
  motif1 : str, encode motif name
  motif2 : str, encode motif name
  seq_length : int, length of sequence
  min_spacing : int, minimum inter motif spacing
  max_spacing : int, maximum inter motif spacing
  num_pos : int, number of positive class sequences
  num_neg : int, number of negative class sequences
  GC_fraction : float, GC fraction in background sequence (passed to
      ``get_distribution`` to build the background distribution)

  Returns
  -------
  sequence_arr : ndarray
      Sequence strings, positive class first, then negative class.
  y : ndarray
      Boolean class labels built as one single-element row per sequence
      (``True`` rows for positives followed by ``False`` rows).
  embedding_arr : list
      Per-sequence embeddings, positive class first.
      NOTE(review): assumes ``simulate_multi_motif_embedding`` returns its
      embeddings as a list so the two parts can be concatenated with ``+``.
  """
  import simdna
  from simdna import synthetic

  motif_db = synthetic.LoadedEncodeMotifs(
      simdna.ENCODE_MOTIFS_PATH, pseudocountProb=0.001)

  # Each PWM sampler is wrapped in ReverseComplementWrapper before pairing.
  first_sampler = synthetic.PwmSamplerFromLoadedMotifs(motif_db, motif1)
  first_generator = synthetic.ReverseComplementWrapper(first_sampler)
  second_sampler = synthetic.PwmSamplerFromLoadedMotifs(motif_db, motif2)
  second_generator = synthetic.ReverseComplementWrapper(second_sampler)

  spacing_generator = synthetic.UniformIntegerGenerator(
      min_spacing, max_spacing)
  pair_generator = synthetic.PairEmbeddableGenerator(
      first_generator, second_generator, spacing_generator)
  pair_embedder = synthetic.EmbeddableEmbedder(pair_generator)

  background = synthetic.ZeroOrderBackgroundGenerator(
      seq_length, discreteDistribution=get_distribution(GC_fraction))
  positive_simulator = synthetic.EmbedInABackground(
      background, [pair_embedder])

  # Materialise all positive sequences before the negatives are simulated.
  positives = tuple(
      synthetic.GenerateSequenceNTimes(positive_simulator,
                                       num_pos).generateSequences())
  positive_seqs = []
  positive_embeddings = []
  for generated in positives:
    positive_seqs.append(generated.seq)
    positive_embeddings.append(generated.embeddings)
  positive_seq_arr = np.array(positive_seqs)

  # Negative class: both motifs requested (min and max motif count of 2),
  # labels returned by the helper are discarded.
  negative_seq_arr, _, negative_embeddings = simulate_multi_motif_embedding(
      [motif1, motif2], seq_length, 2, 2, num_neg, GC_fraction)

  sequence_arr = np.concatenate((positive_seq_arr, negative_seq_arr))
  labels = [[True]] * num_pos + [[False]] * num_neg
  y = np.array(labels)
  embedding_arr = positive_embeddings + negative_embeddings
  return sequence_arr, y, embedding_arr
Ejemplo n.º 2
0
    def test_simple_motif_grammar(self):
        """Check pair embedding of two motifs with a uniformly drawn spacing.

        Generates seeded sequences containing a SIX5_known5 / ZNF143_known2
        pair and verifies, for every sequence, the sequence length, the
        placement of both motif strings, the recorded separation and its
        bounds; finally checks that the observed separations are close to
        uniform over [min_sep, max_sep].
        """
        seq_len = 100
        min_sep = 2
        max_sep = 6
        num_sequences = 4000
        random.seed(1234)
        np.random.seed(1234)

        motif_db = sn.LoadedEncodeMotifs(simdna.ENCODE_MOTIFS_PATH,
                                         pseudocountProb=0.001)
        first_motif = sn.PwmSamplerFromLoadedMotifs(motif_db, "SIX5_known5")
        second_motif = sn.PwmSamplerFromLoadedMotifs(motif_db,
                                                     "ZNF143_known2")
        spacing = sn.UniformIntegerGenerator(min_sep, max_sep)
        pair_generator = sn.PairEmbeddableGenerator(
            first_motif, second_motif, spacing)
        pair_embedder = sn.EmbeddableEmbedder(pair_generator)
        background = sn.ZeroOrderBackgroundGenerator(seq_len)
        simulator = sn.EmbedInABackground(background, [pair_embedder])
        generated_seqs = list(
            sn.GenerateSequenceNTimes(simulator,
                                      num_sequences).generateSequences())

        separation_counts = defaultdict(int)
        for generated in generated_seqs:
            assert len(generated.seq) == seq_len
            # embeddings[0] and [1] are the two motifs, [2] is the pair
            first = generated.embeddings[0]
            second = generated.embeddings[1]
            pair = generated.embeddings[2]
            first_len = len(first.what)
            second_len = len(second.what)
            separation = pair.what.separation

            assert first_len == len(first.what.string)
            assert second_len == len(second.what.string)
            assert len(pair.what) == first_len + second_len + separation

            # the first motif's string sits at its recorded position
            first_end = first.startPos + first_len
            assert generated.seq[first.startPos:first_end] == first.what.string
            # the second motif's string sits at its recorded position
            second_end = second.startPos + second_len
            assert (generated.seq[second.startPos:second_end]
                    == second.what.string)

            # the gap between the motifs equals the recorded separation
            gap = second.startPos - (first.startPos + len(first.what.string))
            assert gap == separation
            # the separation stays within the requested limits
            assert min_sep <= separation <= max_sep
            # tally the separation for the distribution check below
            separation_counts[separation] += 1

        expected_fraction = 1.0 / (max_sep - min_sep + 1)
        for possible_sep in range(min_sep, max_sep + 1):
            observed_fraction = (separation_counts[possible_sep]
                                 / float(num_sequences))
            np.testing.assert_almost_equal(
                observed_fraction, expected_fraction, 2)
             
def variableSpacingGrammar(options):
    """Simulate sequences holding a motif pair with variable spacing.

    Two PWM samplers are built for ``options.motifName1`` and
    ``options.motifName2`` and embedded as a pair into a zero-order
    background of length ``options.seqLength``. The separation generator is
    a ``PoissonQuantityGenerator(options.meanSpacing)`` wrapped in a
    ``MinMaxWrapper`` with ``options.minSpacing`` / ``options.maxSpacing``.
    ``options.numSeq`` sequences are written with ``synthetic.printSequences``
    (FASTA and embeddings included) to a file named::

        variableSpacingGrammarSimulation_prefix-<prefix>_motif1-<motif1>
        _motif2-<motif2>_seqLength<seqLength>_numSeq<numSeq>.simdata

    Parameters
    ----------
    options : object
        Must expose the attributes ``pathToMotifs``, ``motifName1``,
        ``motifName2``, ``seqLength``, ``numSeq``, ``prefix``,
        ``meanSpacing``, ``minSpacing`` and ``maxSpacing``.
        ``prefix`` and the motif names are concatenated into the file name,
        so they must be strings.

    Returns
    -------
    None
    """
    loadedMotifs = synthetic.LoadedEncodeMotifs(options.pathToMotifs,
                                                pseudocountProb=0.001)
    motifName1 = options.motifName1
    motifName2 = options.motifName2
    seqLength = options.seqLength
    numSeq = options.numSeq
    outputFileName = ("variableSpacingGrammarSimulation_"
                      + "prefix-" + options.prefix
                      + "_motif1-" + motifName1 + "_motif2-" + motifName2
                      + "_seqLength" + str(seqLength) + "_numSeq"
                      + str(numSeq) + ".simdata")

    motif1Generator = synthetic.PwmSamplerFromLoadedMotifs(
        motifName=motifName1, loadedMotifs=loadedMotifs)
    motif2Generator = synthetic.PwmSamplerFromLoadedMotifs(
        motifName=motifName2, loadedMotifs=loadedMotifs)

    # Poisson-distributed spacing, constrained by the configured min/max.
    separationGenerator = synthetic.MinMaxWrapper(
        synthetic.PoissonQuantityGenerator(options.meanSpacing),
        theMin=options.minSpacing,
        theMax=options.maxSpacing)

    # Only the pair embedder is used; the motifs are never embedded singly.
    pairEmbedder = synthetic.EmbeddableEmbedder(
        embeddableGenerator=synthetic.PairEmbeddableGenerator(
            embeddableGenerator1=motif1Generator,
            embeddableGenerator2=motif2Generator,
            separationGenerator=separationGenerator))

    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=synthetic.ZeroOrderBackgroundGenerator(seqLength),
        embedders=[pairEmbedder])

    sequenceSet = synthetic.GenerateSequenceNTimes(embedInBackground, numSeq)
    synthetic.printSequences(outputFileName, sequenceSet,
                             includeFasta=True, includeEmbeddings=True,
                             prefix=options.prefix)
Ejemplo n.º 4
0
def motifGrammarSimulation(options):
    """Simulate sequences for one motif-grammar setting and write them out.

    ``options.generationSetting`` selects what is embedded into a zero-order
    background of length ``options.seqLength``:

    - ``allBackground``: nothing is embedded
    - ``singleMotif1`` / ``singleMotif2``: only that motif
    - ``twoMotifs``: both motifs through independent embedders
    - ``twoMotifsFixedSpacing``: both motifs as a pair separated by
      ``options.fixedSpacingOrMinSpacing``
    - ``twoMotifsVariableSpacing``: both motifs as a pair whose separation
      comes from ``UniformIntegerGenerator`` between
      ``options.fixedSpacingOrMinSpacing`` and ``options.maxSpacing``

    Sequences are named with the prefix ``"synthNeg"`` for ``allBackground``
    and ``twoMotifs`` and ``"synthPos"`` otherwise. ``options.numSeq``
    sequences are written with ``synthetic.printSequences`` (FASTA and
    embeddings included). The output file name encodes the setting, whether
    ``bestHit`` is on, the motif names, the sequence length and the count;
    motif1 is left out of the name for ``singleMotif2`` and motif2 for
    ``singleMotif1``.

    Parameters
    ----------
    options : object
        Must expose ``bestHit``, ``pathToMotifs``, ``motifName1``,
        ``motifName2``, ``seqLength``, ``numSeq`` and ``generationSetting``,
        plus ``fixedSpacingOrMinSpacing`` / ``maxSpacing`` for the spacing
        settings. ``generationSetting`` and the motif names are concatenated
        into the file name, so they must be strings.

    Raises
    ------
    RuntimeError
        If ``options.generationSetting`` is not one of the settings above.
    """
    pc = 0.001
    bestHit = options.bestHit
    loadedMotifs = synthetic.LoadedEncodeMotifs(options.pathToMotifs,
                                                pseudocountProb=pc)
    motifName1 = options.motifName1
    motifName2 = options.motifName2
    seqLength = options.seqLength
    numSeq = options.numSeq
    generationSetting = options.generationSetting

    outputFileName = "motifGrammarSimulation_" + generationSetting
    if bestHit:
        outputFileName += "_bestHit"
    # Compare by value: an identity test (`is not`) is True for an equal
    # string held in a different object, which would put both motif names
    # into the file name even for the single-motif settings.
    if generationSetting != generationSettings.singleMotif2:
        outputFileName += "_motif1-" + motifName1
    if generationSetting != generationSettings.singleMotif1:
        outputFileName += "_motif2-" + motifName2
    outputFileName += ("_seqLength" + str(seqLength)
                       + "_numSeq" + str(numSeq) + ".simdata")

    # bestHit switches from sampling the PWM to BestHitPwmFromLoadedMotifs.
    if bestHit:
        theClass = synthetic.BestHitPwmFromLoadedMotifs
    else:
        theClass = synthetic.PwmSamplerFromLoadedMotifs
    motif1Generator = theClass(motifName=motifName1,
                               loadedMotifs=loadedMotifs)
    motif2Generator = theClass(motifName=motifName2,
                               loadedMotifs=loadedMotifs)
    motif1Embedder = synthetic.SubstringEmbedder(
        substringGenerator=motif1Generator)
    motif2Embedder = synthetic.SubstringEmbedder(
        substringGenerator=motif2Generator)

    if generationSetting in (generationSettings.allBackground,
                             generationSettings.twoMotifs):
        namePrefix = "synthNeg"
    else:
        namePrefix = "synthPos"

    if generationSetting == generationSettings.allBackground:
        embedders = []
    elif generationSetting == generationSettings.singleMotif1:
        embedders = [motif1Embedder]
    elif generationSetting == generationSettings.singleMotif2:
        embedders = [motif2Embedder]
    elif generationSetting == generationSettings.twoMotifs:
        embedders = [motif1Embedder, motif2Embedder]
    elif generationSetting in (generationSettings.twoMotifsFixedSpacing,
                               generationSettings.twoMotifsVariableSpacing):
        if generationSetting == generationSettings.twoMotifsFixedSpacing:
            separationGenerator = synthetic.FixedQuantityGenerator(
                options.fixedSpacingOrMinSpacing)
        else:
            separationGenerator = synthetic.UniformIntegerGenerator(
                minVal=options.fixedSpacingOrMinSpacing,
                maxVal=options.maxSpacing)
        embedders = [
            synthetic.EmbeddableEmbedder(
                embeddableGenerator=synthetic.PairEmbeddableGenerator(
                    embeddableGenerator1=motif1Generator,
                    embeddableGenerator2=motif2Generator,
                    separationGenerator=separationGenerator))
        ]
    else:
        raise RuntimeError("unsupported generationSetting:" +
                           generationSetting)

    embedInBackground = synthetic.EmbedInABackground(
        backgroundGenerator=synthetic.ZeroOrderBackgroundGenerator(seqLength),
        embedders=embedders,
        namePrefix=namePrefix)

    sequenceSet = synthetic.GenerateSequenceNTimes(embedInBackground, numSeq)
    synthetic.printSequences(outputFileName,
                             sequenceSet,
                             includeFasta=True,
                             includeEmbeddings=True)