def translate(word): return [{"A":0, "T":1, "G":2, "C":3}[i] for i in word] def translateRaw(array): return "".join([{0:"A", 1:"T", 2:"G", 3:"C"}[i] for i in array]) scoreFolder = sys.argv[1] upstreamList = sys.argv[2] fullUpstreamList = sys.argv[3] if not os.path.exists("temp"): os.mkdir("temp") current = PatternFinder(6, "temp") current.loadSequences(upstreamList) full = PatternFinder(6, scoreFolder) full.loadSequences(fullUpstreamList) temp = PatternFinder(6, scoreFolder) def makeTextShuffleControl(filename, minLen=10000): cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T controls = [] repeat = 1 + minLen / len(cdata[0]) for _ in xrange(repeat):
if verbose: print "Eigenvalues are:", latent return (np.transpose(coeff[:, ::-1]), latent[::-1]) def translate(word): return [{"A":0, "T":1, "G":2, "C":3}[i] for i in word] def translateRaw(array): return "".join([{0:"A", 1:"T", 2:"G", 3:"C"}[i] for i in array]) a = PatternFinder(6, scoreFolder) a.loadSequences(sequenceFile) allSeqs = a.rawSequences data = pd.read_csv(os.path.join(scoreFolder, "sortedBy/Best10000_sortBy_ScoreNew_8.csv")) lef = data["Pos l"].values rig = data["Pos r"].values mask = (abs(rig - POSITION_RIGHT) < 4) * (abs(lef - POSITION_LEFT) < 4) data = data[mask] patLeftBest = data["Patt l"].values[0] patRightBest = data["Patt r"].values[0] data = data[:300] assert len(data) > 90 # check that we have at least 90 unique patterns allPats = []
BEST_SINGLE is the number of best n-mers to use for each side of the pattern.
Then all pairwise combinations of BEST_SINGLE x BEST_SINGLE 6-mers will be evaluated at every pair of positions in the upstream
MAX_SCORE is the maximum score (offset + extension + mismatches) for the whole pattern
MAX_SCORE_SINGLE is the maximum score for scoring separate n-mers (used for selection of BEST_SINGLE only)
MAX_SUBS and MAX_SHIFT are maximum number of subs and maximum offset of a pattern
"""
# NOTE(review): the text above is the end of a string literal whose opening
# quotes lie before the visible source (presumably the script's docstring).

a = PatternFinder(6, sys.argv[2])  # create a patternFinder object bound to folder provided as a second command line argument
a.loadSequences(sys.argv[1])  # sequences are loaded from the first command line argument

# if third argument is provided, we interpret it as a set of locations where to evaluate pattern
if len(sys.argv) >= 4:
    # Comma-separated groups of dash-separated integers, e.g. "1-2,3-4";
    # each group becomes a list of ints (Python 2 map returns a list).
    locations = [map(int, j.split("-")) for j in sys.argv[3].split(",")]
    print locations
else:
    # If not provided, the program will automatically select it
    locations = None

# NOTE(review): this call continues beyond the visible source; only its first
# two keyword arguments are shown.
a.scorePatternsThroughBest(
    bestNum=BEST_SINGLE,
    maxScore=MAX_SCORE,