Example #1
# Imports needed by this snippet; Reader comes from the seekr package, and
# hmmCalc is defined elsewhere in this project
import pickle
from math import log
from collections import defaultdict
from itertools import product
from multiprocessing import pool

import pandas as pd
from seekr.fasta_reader import Reader

model = args.model
if not model.endswith('/'):
    model += '/'

kDir = f'{model}{args.k}/'
modelName = model.split('/')[-2]

# Load the trained HMM (the hmm.mkv pickle written by train.py) for this k
hmm = pickle.load(open(kDir + 'hmm.mkv', 'rb'))


# Determine k from the number of k-mers in the emission matrix and the size
# of the alphabet used to generate it
k = int(log(len(hmm['E']['+']), len(args.a)))
alphabet = args.a.upper()  # alphabet comes from the -a argument, as elsewhere in this project
kmers = [''.join(p) for p in product(alphabet, repeat=k)]  # generate all k-mers
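# e.g. a 4-letter alphabet with 256 emission k-mers gives k = log(256, 4) = 4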
target = Reader(args.db)
targetSeqs,targetHeaders = target.get_seqs(),target.get_headers()
targetMap = defaultdict(list)


# Distribute jobs across the number of CPU cores specified by the user
with pool.Pool(args.n) as multiN:
    # hmmCalc receives a single (header, sequence) tuple per job
    jobs = multiN.map(hmmCalc, zip(targetHeaders, targetSeqs))
    dataDict = dict(jobs)
# Drop targets that returned no hits (None) before concatenating
dataFrames = pd.concat([df for df in dataDict.values() if df is not None])
dataFrames['Length'] = dataFrames['End'] - dataFrames['Start']
dataFrames = dataFrames[['Start','End','Length','kmerLLR','seqName','Sequence']]
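# Keep the Sequence column only when fasta output is requested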
if not args.fasta:
    dataFrames = dataFrames[['Start','End','Length','kmerLLR','seqName']]
Example #2
from itertools import groupby
from operator import itemgetter
from collections import defaultdict

import numpy as np
from tqdm import tqdm
from seekr.fasta_reader import Reader


def setCoords(flatCoords):  # signature restored from the setCoords(whereHit) call below
    # Group consecutive indices into runs: index - value is constant within a run
    finalCoords = []
    for k, g in groupby(enumerate(flatCoords), lambda kv: kv[0] - kv[1]):
        finalCoords.append(list(map(itemgetter(1), g)))
    return finalCoords

def nsmall(a, n):
    # n-th smallest element of a (0-indexed) via a partial sort
    return np.partition(a, n)[n]
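# e.g. nsmall(np.array([5, 1, 3]), 1) -> 3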


k = 4

# Pre-trained k=4 model matrices (produced elsewhere in this project)
BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')

mm10Genes = Reader('gencodeGenesMm10.fa')
mm10Seq = mm10Genes.get_seqs()
mm10Head = mm10Genes.get_headers()
w = 200  # tile (window) width in nucleotides
s = 20   # step between tile start positions
seqMap = defaultdict(list)

initModelMap = {'BCD':BCD,'AE':AE}
for head,seq in tqdm(zip(mm10Head,mm10Seq),total=len(mm10Seq)):
    tiles = [seq[i:i+w] for i in range(0,len(seq)-w+1,s)]
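    # e.g. a 400 nt sequence with w=200, s=20 gives tiles starting at 0, 20, ..., 200
    # classify (defined elsewhere in this project) scores each tile against a model;
    # tiles scoring below 0 are treated as hits below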
    mapMm10BCD = np.array([classify(tile,k,initModelMap['BCD']) for tile in tiles])
    mapMm10AE = np.array([classify(tile,k,initModelMap['AE']) for tile in tiles])
    whereHitBCD = np.array(np.nonzero(mapMm10BCD < 0))
    whereHitAE = np.array(np.nonzero(mapMm10AE < 0))
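    # merge hit indices from both models, removing duplicates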
    whereHit = np.unique(np.concatenate((whereHitBCD,whereHitAE),axis=None))
    if whereHit.size > 0:
        coords = setCoords(whereHit)
Example #3
                conds[i, j] = kmers[ci + cj] / float(tot)

    return conds


# Imports needed below; Reader and BasicCounter come from the seekr package
import itertools
import numpy as np
from seekr.fasta_reader import Reader
from seekr.kmer_counts import BasicCounter

# Null model: average k-mer content of the reference transcriptome
genomeRawCounts = BasicCounter(
    '/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa',
    k=k,
    mean=False,
    std=False,
    log2=False,
    alphabet='ATCG')
genomeFa = Reader('/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa')
genomeSeqs = genomeFa.get_seqs()
genomeSeqLens = [len(i) for i in genomeSeqs]
genomeRawCounts.get_counts()
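# BasicCounter counts come back normalized per kb, so multiply by length/1000
# to recover raw (integer) counts before averaging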
unNormGenomeCounts = genomeRawCounts.counts.T * genomeSeqLens / 1000
genomeCounts = np.rint(unNormGenomeCounts.T)
weightedAvgGenomeCounts = np.average(genomeCounts,
                                     weights=genomeSeqLens,
                                     axis=0)

kmers = [''.join(p) for p in itertools.product('ATCG', repeat=k)]
curr_kmers = dict(zip(kmers, weightedAvgGenomeCounts))
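# markov_chain (its tail appears at the top of this example) converts the
# averaged k-mer counts into conditional transition probabilities for the null model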
genome_avg = markov_chain(curr_kmers, k, 'ATCG')

np.save(f'./genome{k}.mkv.npy', genome_avg)

# initModel
Example #4
parser.add_argument(
    '-n',  # flag name inferred from the args.n usage below
    type=int,
    help='Number of CPU cores. Each job corresponds to a value of k, and the program scales well with multiprocessing',
    default=1)

args = parser.parse_args()

if __name__ == '__main__':

    # Read in specified values of k, and the alphabet
    kVals = [int(i) for i in args.k.split(',')]
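    # e.g. '2,3,4' -> [2, 3, 4]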
    a = args.a.upper()

    #SEEKR fasta reader module
    F = Reader(args.fasta)
    fS = F.get_seqs()

    #Join sequences together using $ delimiter character
    fString = '$'.join(fS).upper()
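    # '$' never appears in the alphabet, so no k-mer can span two sequences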
    lenFString = sum([len(i) for i in fS])

    # Need to figure out how to deal with very long fasta files (~ 2-3X the size of the transcriptome in mice)
    # if lenFString >= 2147483647:
    #     fString='$'.join(fS[::10]).upper()

    # Split jobs across processors and call the kmers.pyx Cython module
    with pool.Pool(args.n) as multiN:
        # one job per value of k: (fString, k, a)
        jobs = multiN.starmap(kmers.main, product([fString], kVals, [a]))
        dataDict = dict(jobs)
        dataDict = dict(jobs)

    #Save data
Example #5
import argparse
import pickle

import corefunctions  # project-local helper module
from seekr.fasta_reader import Reader

parser = argparse.ArgumentParser()
parser.add_argument("-k",type=int)
parser.add_argument('--db',type=str,help='Path to fasta file containing training sequences')
parser.add_argument('--prior',type=str,help='Path to binary .mkv file output from train.py (e.g. markovModels/D_null/2/hmm.mkv)')
parser.add_argument('-cf','--createfile',action='store_true',help='Create new file rather than overwrite')
parser.add_argument('--its',type=int,help='Iterations to do, default=20',default=20)

args = parser.parse_args()

assert args.its > 0, '--its must be an integer greater than or equal to 1'
assert args.k > 0, '-k must be an integer greater than or equal to 1'



fa = Reader(args.db)
seqs = '$'.join(fa.get_seqs())
model = args.prior

k = args.k

# Identify the location of any ambiguous nucleotides (N)
O,oIdx,nBP = corefunctions.kmersWithAmbigIndex(seqs,k)
# Load in train.py output 
hmm = pickle.load(open(args.prior,'rb'))

'''
A - transition matrix (dictionary)
E - emission matrix (dictionary)
pi - initial state probabilities (always 50/50)
states - list of states (query,null)
'''
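
# Sketch of the loaded structure (key names from the docstring above; the
# '+'/'-' state labels follow the hmm['E']['+'] usage elsewhere in this project):
#   hmm['A']['+']['-']  -> probability of moving from the query to the null state
#   hmm['E']['+']       -> {kmer: emission probability in the query state, ...}
#   hmm['pi']           -> initial state probabilities (always 50/50)
#   hmm['states']       -> (query, null) state labels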