def translate(word):
    """Convert a nucleotide string into a list of integer codes.

    The mapping is A -> 0, T -> 1, G -> 2, C -> 3.

    word: an iterable of single-character strings (typically a str).
    Returns a list with one integer code per character, in order.
    Raises KeyError for any character other than upper-case A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Convert a sequence of integer codes back into a nucleotide string.

    The mapping is 0 -> A, 1 -> T, 2 -> G, 3 -> C (the inverse of translate).

    array: an iterable of codes, each equal to 0, 1, 2 or 3.
    Returns the concatenated string of letters, in order.
    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join(letters[code] for code in array)


# Positional command-line arguments; all three are required
# (a missing one raises IndexError).
scoreFolder = sys.argv[1]
upstreamList = sys.argv[2]
fullUpstreamList = sys.argv[3]


# Make sure a "temp" directory exists in the current working directory.
if not os.path.exists("temp"):
    os.mkdir("temp")
# PatternFinder bound to "temp", loaded with the sequences from upstreamList.
# NOTE(review): 6 is presumably the n-mer length -- confirm against PatternFinder.
current = PatternFinder(6, "temp")
current.loadSequences(upstreamList)

# PatternFinder bound to scoreFolder, loaded with the sequences from fullUpstreamList.
full = PatternFinder(6, scoreFolder)
full.loadSequences(fullUpstreamList)

# A third PatternFinder bound to scoreFolder; no sequences are loaded into it here.
temp = PatternFinder(6, scoreFolder)



def makeTextShuffleControl(filename, minLen=10000):
    cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T

    controls = []
    repeat = 1 + minLen / len(cdata[0])
# Example #2
# 0
    [latent, coeff] = scipy.sparse.linalg.eigsh(covM, numPCs)
    if verbose:
        print "Eigenvalues are:", latent
    return (np.transpose(coeff[:, ::-1]), latent[::-1])



def translate(word):
    """Convert a nucleotide string into a list of integer codes.

    The mapping is A -> 0, T -> 1, G -> 2, C -> 3.

    word: an iterable of single-character strings (typically a str).
    Returns a list with one integer code per character, in order.
    Raises KeyError for any character other than upper-case A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Convert a sequence of integer codes back into a nucleotide string.

    The mapping is 0 -> A, 1 -> T, 2 -> G, 3 -> C (the inverse of translate).

    array: an iterable of codes, each equal to 0, 1, 2 or 3.
    Returns the concatenated string of letters, in order.
    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join(letters[code] for code in array)



# Build a PatternFinder bound to scoreFolder, load sequenceFile into it and
# keep a reference to its rawSequences attribute.
# NOTE(review): scoreFolder, sequenceFile, POSITION_LEFT and POSITION_RIGHT are
# not defined in this section -- they must be set earlier in the script.
a = PatternFinder(6, scoreFolder)
a.loadSequences(sequenceFile)
allSeqs = a.rawSequences

# Read the pattern table from a fixed file name inside scoreFolder.
data = pd.read_csv(os.path.join(scoreFolder, "sortedBy/Best10000_sortBy_ScoreNew_8.csv"))

# Keep only the rows whose "Pos l" / "Pos r" values differ from
# POSITION_LEFT / POSITION_RIGHT by strictly less than 4.
# Multiplying the two boolean arrays acts as an element-wise AND.
lef = data["Pos l"].values
rig = data["Pos r"].values
mask = (abs(rig - POSITION_RIGHT) < 4) * (abs(lef - POSITION_LEFT) < 4)
data = data[mask]
# Left/right patterns of the first remaining row; raises IndexError if the
# mask removed every row.
# NOTE(review): presumably the best-scoring pair given the file name -- confirm.
patLeftBest = data["Patt l"].values[0]
patRightBest = data["Patt r"].values[0]

# Keep at most the first 300 remaining rows.
data = data[:300]
assert len(data) > 90  # require more than 90 rows (uniqueness of patterns is not checked here)
"""
This is where all the other constants are defined for this part only.
ALPHA is the slope of the score for mismatches, defined in the paper

BEST_SINGLE is the number of best n-mers to use for each side of the pattern.
Then all pairwise combinations of BEST_SINGLE x BEST_SINGLE 6-mers will be evaluated at
every pair of positions in the upstream

MAX_SCORE is the maximum score (offset + extension + mismatches) for the whole pattern
MAX_SCORE_SINGLE is the maximum score for scoring separate n-mers (used for selection of BEST_SINGLE only)

MAX_SUBS and MAX_SHIFT are maximum number of subs and maximum offset of a pattern
"""



# Create a PatternFinder object bound to the folder provided as the second
# command-line argument.
# NOTE(review): 6 is presumably the n-mer length -- confirm against PatternFinder.
a = PatternFinder(6, sys.argv[2])

# Load the sequences from the path provided as the first command-line argument.
a.loadSequences(sys.argv[1])

# If a third argument is provided, it is parsed as a comma-separated list of
# "-"-separated integers, e.g. "10-20,30-40" -> [[10, 20], [30, 40]]
# (under Python 2, where map returns a list), and the result is printed.
# NOTE(review): presumably these are the locations at which the pattern is
# evaluated, as the original comment stated -- the use is outside this section.
if len(sys.argv) >= 4:
    locations = [map(int, j.split("-")) for j in sys.argv[3].split(",")]
    print locations

# Otherwise locations is None; per the original author's comment the program
# then selects the locations automatically (that logic is not in this section).
else:
    locations = None


def translate(word):
    """Convert a nucleotide string into a list of integer codes.

    The mapping is A -> 0, T -> 1, G -> 2, C -> 3.

    word: an iterable of single-character strings (typically a str).
    Returns a list with one integer code per character, in order.
    Raises KeyError for any character other than upper-case A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Convert a sequence of integer codes back into a nucleotide string.

    The mapping is 0 -> A, 1 -> T, 2 -> G, 3 -> C (the inverse of translate).

    array: an iterable of codes, each equal to 0, 1, 2 or 3.
    Returns the concatenated string of letters, in order.
    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join(letters[code] for code in array)


# Positional command-line arguments; all three are required
# (a missing one raises IndexError).
scoreFolder = sys.argv[1]
upstreamList = sys.argv[2]
fullUpstreamList = sys.argv[3]


# Make sure a "temp" directory exists in the current working directory.
if not os.path.exists("temp"):
    os.mkdir("temp")
# PatternFinder bound to "temp", loaded with the sequences from upstreamList.
# NOTE(review): 6 is presumably the n-mer length -- confirm against PatternFinder.
current = PatternFinder(6, "temp")
current.loadSequences(upstreamList)

# PatternFinder bound to scoreFolder, loaded with the sequences from fullUpstreamList.
full = PatternFinder(6, scoreFolder)
full.loadSequences(fullUpstreamList)

# A third PatternFinder bound to scoreFolder; no sequences are loaded into it here.
temp = PatternFinder(6, scoreFolder)



def makeTextShuffleControl(filename, minLen=10000):
    cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T

    controls = []
    repeat = 1 + minLen / len(cdata[0])