# ---- Beispiel #1 (example 1, score: 0) ----
    for k, g in groupby(enumerate(flatCoords),lambda kv:kv[0]-kv[1]):
        finalCoords.append(list(map(itemgetter(1), g)))
    return finalCoords

def nsmall(a, n):
    """Return the element that would sit at index ``n`` of ``a`` sorted ascending.

    Uses ``np.partition`` for O(len(a)) selection instead of a full sort.
    ``n`` is 0-based, so ``nsmall(a, 0)`` is the minimum.
    """
    partitioned = np.partition(a, n)
    return partitioned[n]


# k-mer size used by classify() below.
k = 4

# Pre-trained model matrices (presumably log-likelihood tables for the
# 'BCD' and 'AE' sequence classes — TODO confirm against classify()).
BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')

# Mouse (mm10) gene sequences and their FASTA headers.
mm10Genes = Reader('gencodeGenesMm10.fa')
mm10Seq = mm10Genes.get_seqs()
mm10Head = mm10Genes.get_headers()
# Sliding-window parameters: window width and step (in bases).
w=200
s=20
seqMap = defaultdict(list)

initModelMap = {'BCD':BCD,'AE':AE}
for head,seq in tqdm(zip(mm10Head,mm10Seq),total=len(mm10Seq)):
    # Tile the sequence into overlapping windows of width w, stride s.
    tiles = [seq[i:i+w] for i in range(0,len(seq)-w+1,s)]
    # Score every tile against both models.
    mapMm10BCD = np.array([classify(tile,k,initModelMap['BCD']) for tile in tiles])
    mapMm10AE = np.array([classify(tile,k,initModelMap['AE']) for tile in tiles])
    # Tile indices where the score is negative count as hits —
    # NOTE(review): presumably classify() returns a log-ratio where
    # negative favors the model; confirm the sign convention.
    whereHitBCD = np.array(np.nonzero(mapMm10BCD < 0))
    whereHitAE = np.array(np.nonzero(mapMm10AE < 0))
    # Union of hit indices from both models, deduplicated and flattened.
    whereHit = np.unique(np.concatenate((whereHitBCD,whereHitAE),axis=None))
    if whereHit.size > 0:
        # Group hit indices into runs of consecutive coordinates.
        coords = setCoords(whereHit)
        for consecCoords in coords:
# ---- Beispiel #2 (example 2, score: 0) ----
model = args.model
# Normalize the model directory path to always end with a trailing slash.
if not model.endswith('/'):
    model += '/'

# Per-k model subdirectory, e.g. <model>/<k>/
kDir = model + f'{args.k}' + '/'
# Last path component of the model directory (valid because of the
# trailing-slash normalization above).
modelName = model.split('/')[-2]
# Check if file exists and open if so, else skip this iteration of the loop

# BUG FIX: the original did pickle.load(open(...)) and leaked the file
# handle; a context manager closes it deterministically.
# SECURITY NOTE(review): pickle.load executes arbitrary code from the file —
# only load model files from trusted sources.
with open(kDir + 'hmm.mkv', 'rb') as hmm_file:
    hmm = pickle.load(hmm_file)

# Explicitly determine k from the size of the log matrix and the size of the
# alphabet used to generate it: |E['+']| == |alphabet| ** k.
k = int(log(len(hmm['E']['+'].keys()), len(args.a)))
# NOTE(review): k is derived from len(args.a) but the k-mers are built from
# `alphabet` — confirm the two refer to the same alphabet.
kmers = [''.join(p) for p in product(alphabet, repeat=k)]  # generate k-mers
target = Reader(args.db)
targetSeqs, targetHeaders = target.get_seqs(), target.get_headers()
targetMap = defaultdict(list)


#Pool processes onto number of CPU cores specified by the user
with pool.Pool(args.n) as multiN:
    jobs = multiN.starmap(hmmCalc,product(*[list(zip(targetHeaders,targetSeqs))]))
    dataDict = dict(jobs)
#Check if no hits were found
# if not all(v == None for v in dataDict.values()):

dataFrames = pd.concat([df for df in dataDict.values() if not None])
dataFrames['Length'] = dataFrames['End'] - dataFrames['Start']
dataFrames = dataFrames[['Start','End','Length','kmerLLR','seqName','Sequence']]
if not args.fasta:
    dataFrames = dataFrames[['Start','End','Length','kmerLLR','seqName']]
# ---- Beispiel #3 (example 3, score: 0) ----
# One row per target sequence, built from the raw hits mapping.
df = pd.DataFrame.from_dict(hits,orient='index')
# Re-group each row's hit list by query index: {query_idx: [positions, ...]}.
for row in df.iterrows():
    # Fresh accumulator per row. (The original also pre-initialized
    # query_dict before the loop and .clear()ed it after .copy() —
    # both dead code, removed here.)
    query_dict = defaultdict(list)
    name = row[0]
    # NOTE(review): row is an (index, Series) pair; row[1:][0][0] extracts
    # the first cell of the row — this assumes the entire hit list lives in
    # column 0. Confirm against how `hits` is populated upstream.
    data = row[1:][0][0]
    for i in data:
        # Each hit i is (position, query_idx) — presumably; TODO confirm.
        query_dict[i[1]].append(i[0])
    hits[name] = query_dict

parsedDf = pd.DataFrame.from_dict(hits,orient='index')

# Map the integer query columns back to their FASTA headers.
queryFasta = Reader('./queries.fa')
queryHeaders = queryFasta.get_headers()

# NOTE(review): hard-coded 13 assumes exactly 13 query sequences — confirm.
queryMap = dict(zip(list(range(13)),queryHeaders))
parsedDf.rename(columns=queryMap,inplace=True)
queryCons = {}

# For every query column, compute conservation overlaps in parallel.
for query in parsedDf:
    # NOTE(review): dSeekrCoords does not take `query`, so every iteration
    # computes identical coordinates — confirm whether this is intended.
    seekrGenomicCoords = dSeekrCoords(parsedDf,args.w,args.s)
    with multiprocessing.Pool(args.n) as pool:
        # BUG FIX: the original product(...) call had unbalanced brackets
        # (missing `]`), a SyntaxError. getCons is invoked once per
        # (coords-items, gtf) combination.
        ha = pool.starmap(getCons, product(
            *[[list(seekrGenomicCoords.items())], [mouseGTF]]))
        merge = dict(ha)
    queryCons[query] = merge
    print(queryCons)
    # BUG FIX: removed `1/0` debugging residue, which crashed the loop with
    # ZeroDivisionError after the first query.