def read_simdata_file(simdata_file, one_hot_encode=False, ids_to_load=None):
    """Load ids, sequences, embeddings and labels from a simdata file.

    Every line after the first is handed to an internal callback as a list
    of fields (produced by ``util.default_tab_seppd`` -- presumably a
    tab-split of the line; confirm against ``util``). The fields are read as:

    * field 0   -- the record id
    * field 1   -- the sequence
    * field 2   -- a string passed to ``getEmbeddingsFromString``
    * field 3+  -- labels, each converted with ``int``

    Args:
        simdata_file: Value passed to ``util.get_file_handle`` to obtain
            the handle that is read.
        one_hot_encode: Accepted for interface compatibility; this function
            does not use it.
        ids_to_load: Optional iterable of ids. When given, only records
            whose id (field 0) is in it are kept. ``None`` keeps everything.

    Returns:
        The result of ``util.enum(...)`` called with ``ids``, ``sequences``
        and ``embeddings`` as lists and ``labels`` as ``np.array`` of the
        per-record integer label lists.
    """
    # Build the membership set once; None means "no filtering".
    wanted_ids = None if ids_to_load is None else set(ids_to_load)

    kept_ids = []
    kept_sequences = []
    kept_embeddings = []
    kept_labels = []

    def collect_record(fields, line_number):
        # The first line is skipped (presumably a header row).
        if line_number <= 1:
            return
        record_id = fields[0]
        if wanted_ids is not None and record_id not in wanted_ids:
            return
        kept_ids.append(record_id)
        kept_sequences.append(fields[1])
        kept_embeddings.append(getEmbeddingsFromString(fields[2]))
        kept_labels.append([int(raw_label) for raw_label in fields[3:]])

    handle = util.get_file_handle(simdata_file)
    util.perform_action_on_each_line_of_file(
        file_handle=handle,
        action=collect_record,
        transformation=util.default_tab_seppd)

    return util.enum(
        ids=kept_ids,
        sequences=kept_sequences,
        embeddings=kept_embeddings,
        labels=np.array(kept_labels))
#!/usr/bin/env python import os import sys import simdna import simdna.util as util import simdna.synthetic as synthetic import simdna.pwm as pwm generationSettings = util.enum( allBackground="allBackground", singleMotif1="singleMotif1" #embeds first motif , singleMotif2="singleMotif2" #embeds second motif , twoMotifs="twoMotifs" #embeds one of both motifs , twoMotifsFixedSpacing= "twoMotifsFixedSpacing" #embeds both motifs with a fixed spacing , twoMotifsVariableSpacing= "twoMotifsVariableSpacing" #embeds both motifs with a variable spacing ) def motifGrammarSimulation(options): pc = 0.001 bestHit = options.bestHit pathToMotifs = options.pathToMotifs loadedMotifs = synthetic.LoadedEncodeMotifs(pathToMotifs, pseudocountProb=pc) motifName1 = options.motifName1