def translate(word):
    """Encode a nucleotide string as a list of integer codes.

    Characters are mapped A -> 0, T -> 1, G -> 2, C -> 3 (upper case only).
    `word` may be any iterable of single-character strings; the result is a
    list with one code per character, in the same order.

    Raises KeyError for any character other than A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Decode a sequence of integer codes into a nucleotide string.

    Codes are mapped 0 -> "A", 1 -> "T", 2 -> "G", 3 -> "C", i.e. the
    inverse of the mapping used by translate(). `array` may be any iterable
    of such codes; the result is a single string of the same length.

    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join([letters[code] for code in array])


# Positional command-line arguments: the score folder, then two sequence-list
# files that are passed to PatternFinder.loadSequences below.
scoreFolder = sys.argv[1]
upstreamList = sys.argv[2]
fullUpstreamList = sys.argv[3]


# Make sure the local "temp" directory exists before a PatternFinder is bound to it.
if not os.path.exists("temp"):
    os.mkdir("temp")
# PatternFinder bound to "temp", loaded from upstreamList.
# NOTE(review): the 6 is presumably the n-mer length (6-mers) - confirm against PatternFinder.
current = PatternFinder(6, "temp")
current.loadSequences(upstreamList)

# PatternFinder bound to scoreFolder, loaded from fullUpstreamList.
full = PatternFinder(6, scoreFolder)
full.loadSequences(fullUpstreamList)

# A second PatternFinder on scoreFolder; no sequences are loaded into it here.
temp = PatternFinder(6, scoreFolder)



def makeTextShuffleControl(filename, minLen=10000):
    cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T

    controls = []
    repeat = 1 + minLen / len(cdata[0])
Example #2
0
    [latent, coeff] = scipy.sparse.linalg.eigsh(covM, numPCs)
    if verbose:
        print "Eigenvalues are:", latent
    return (np.transpose(coeff[:, ::-1]), latent[::-1])



def translate(word):
    """Encode a nucleotide string as a list of integer codes.

    Characters are mapped A -> 0, T -> 1, G -> 2, C -> 3 (upper case only).
    `word` may be any iterable of single-character strings; the result is a
    list with one code per character, in the same order.

    Raises KeyError for any character other than A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Decode a sequence of integer codes into a nucleotide string.

    Codes are mapped 0 -> "A", 1 -> "T", 2 -> "G", 3 -> "C", i.e. the
    inverse of the mapping used by translate(). `array` may be any iterable
    of such codes; the result is a single string of the same length.

    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join([letters[code] for code in array])



# PatternFinder bound to scoreFolder, loaded from sequenceFile; its raw
# sequences are kept in allSeqs.
a = PatternFinder(6, scoreFolder)
a.loadSequences(sequenceFile)
allSeqs = a.rawSequences

# Table of candidate patterns read from a CSV inside the score folder.
data = pd.read_csv(os.path.join(scoreFolder, "sortedBy/Best10000_sortBy_ScoreNew_8.csv"))

lef = data["Pos l"].values
rig = data["Pos r"].values
# Keep only rows whose left and right positions are each less than 4 away from
# POSITION_LEFT / POSITION_RIGHT; multiplying the two boolean arrays acts as an
# element-wise AND.
mask = (abs(rig - POSITION_RIGHT) < 4) * (abs(lef - POSITION_LEFT) < 4)
data = data[mask]
# The first surviving row supplies the "best" left/right patterns
# (presumably the CSV is sorted best-first - confirm; this raises IndexError
# if the filter left no rows).
patLeftBest = data["Patt l"].values[0]
patRightBest = data["Patt r"].values[0]

# Cap the working set at the first 300 filtered rows.
data = data[:300]
assert len(data) > 90  # require more than 90 rows; NOTE(review): uniqueness of the patterns is not checked here
Example #3
0
This is where all the other constants are defined for this part only.
ALPHA is the slope of the score for mismatches, defined in the paper

BEST_SINGLE is the number of best n-mers to use for each side of the pattern.
Then all pairwise combinations of BEST_SINGLE x BEST_SINGLE 6-mers will be evaluated at
every pair of positions in the upstream

MAX_SCORE is the maximum score (offset + extension + mismatches) for the whole pattern
MAX_SCORE_SINGLE is the maximum score for scoring separate n-mers (used for selection of BEST_SINGLE only)

MAX_SUBS and MAX_SHIFT are maximum number of subs and maximum offset of a pattern
"""



# Bind a PatternFinder object to the folder given as the second command-line
# argument, then load the sequences named by the first argument.
a = PatternFinder(6, sys.argv[2])
a.loadSequences(sys.argv[1])

# Without a third argument, locations stays None and the program will
# automatically select the locations itself.
locations = None

# With a third argument, interpret it as the set of locations at which to
# evaluate the pattern: comma-separated groups of "-"-separated integers.
if len(sys.argv) >= 4:
    locations = [[int(field) for field in group.split("-")]
                 for group in sys.argv[3].split(",")]
    print(locations)


def translate(word):
    """Encode a nucleotide string as a list of integer codes.

    Characters are mapped A -> 0, T -> 1, G -> 2, C -> 3 (upper case only).
    `word` may be any iterable of single-character strings; the result is a
    list with one code per character, in the same order.

    Raises KeyError for any character other than A, T, G or C.
    """
    # Build the lookup table once per call rather than once per character.
    codes = {"A": 0, "T": 1, "G": 2, "C": 3}
    return [codes[base] for base in word]

def translateRaw(array):
    """Decode a sequence of integer codes into a nucleotide string.

    Codes are mapped 0 -> "A", 1 -> "T", 2 -> "G", 3 -> "C", i.e. the
    inverse of the mapping used by translate(). `array` may be any iterable
    of such codes; the result is a single string of the same length.

    Raises KeyError for any code outside 0..3.
    """
    # Build the lookup table once per call rather than once per element.
    letters = {0: "A", 1: "T", 2: "G", 3: "C"}
    return "".join([letters[code] for code in array])


# Positional command-line arguments: the score folder, then two sequence-list
# files that are passed to PatternFinder.loadSequences below.
scoreFolder = sys.argv[1]
upstreamList = sys.argv[2]
fullUpstreamList = sys.argv[3]


# Make sure the local "temp" directory exists before a PatternFinder is bound to it.
if not os.path.exists("temp"):
    os.mkdir("temp")
# PatternFinder bound to "temp", loaded from upstreamList.
# NOTE(review): the 6 is presumably the n-mer length (6-mers) - confirm against PatternFinder.
current = PatternFinder(6, "temp")
current.loadSequences(upstreamList)

# PatternFinder bound to scoreFolder, loaded from fullUpstreamList.
full = PatternFinder(6, scoreFolder)
full.loadSequences(fullUpstreamList)

# A second PatternFinder on scoreFolder; no sequences are loaded into it here.
temp = PatternFinder(6, scoreFolder)



def makeTextShuffleControl(filename, minLen=10000):
    cdata = np.array([translate(i.replace("\n", "").replace("\r", "")) for i in open(filename).readlines() if len(i) > 10]).T

    controls = []
    repeat = 1 + minLen / len(cdata[0])