def run(self):
    self.row_labels = Reader(self.query_path).get_headers()
    query_counts = self.make_query_counts()
    target_reader = Reader(self.target_path)
    target_headers_seqs = target_reader.get_data(tuples_only=True)
    r_values = []
    for target_header, target in my_tqdm()(target_headers_seqs):
        r_values.append(
            self.compare_query_target(query_counts, target_header, target))
    self.r_values2df(r_values)
    self.percentiles = self.calc_percentiles(query_counts)
    self.save()
def save(self, names=None):
    """Saves the counts appropriately based on current settings.

    There are four output methods for the counts:
    1. Binary. This saves just the counts as a binary numpy array.
    2. No labels. Saves in plain text, but without any labels.
    3. Default names. If no names are provided, fasta headers will be used as labels.
    4. Custom names. Provide a list of names if you want to label lncRNAs with your own names.

    Parameters
    ----------
    names : [str] (default=None)
        Unique names for rows of the DataFrame.
    """
    err_msg = ('You cannot label a binary file. '
               'Set only one of "binary" or "label" as True.')
    assert not (self.binary and self.label), err_msg
    assert self.outfile is not None, 'Please provide an outfile location.'
    if self.binary:
        np.save(self.outfile, self.counts)
    elif self.label:
        if names is None:
            names = Reader(self.infasta).get_headers()
        df = DataFrame(data=self.counts, index=names, columns=self.kmers)
        df.to_csv(self.outfile)
    else:
        np.savetxt(self.outfile, self.counts, delimiter=',', fmt='%1.6f')
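# Illustrative usage sketch (not part of the module): how the save modes in
# the docstring above might be exercised from a counter instance. The class
# name BasicCounter and get_counts() appear elsewhere in this codebase; the
# fasta/output file names below are hypothetical.
#
#   counter = BasicCounter('lncRNAs.fa', outfile='counts.npy')      # binary=True by default
#   counter.get_counts()
#   counter.save()                          # 1. binary numpy array of counts
#
#   counter = BasicCounter('lncRNAs.fa', outfile='counts.csv',
#                          binary=False, label=True)
#   counter.get_counts()
#   counter.save()                          # 3. CSV labeled with fasta headers
#   counter.save(names=['lnc1', 'lnc2'])    # 4. CSV labeled with custom names
#
#   # With binary=False and label=False, save() falls back to an unlabeled
#   # plain-text matrix via np.savetxt (mode 2).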
def __init__(
    self,
    infasta=None,
    outfile=None,
    k=6,
    binary=True,
    mean=True,
    std=True,
    log2=Log2.post,
    leave=True,
    silent=False,
    label=False,
    alphabet="AGTC",
):
    self.infasta = infasta
    self.seqs = None
    if infasta is not None:
        self.seqs = Reader(infasta).get_seqs()
    self.outfile = outfile
    self.k = k
    self.binary = binary
    # mean/std may be booleans or paths to precomputed numpy arrays
    self.mean = mean
    if isinstance(mean, str):
        self.mean = np.load(mean)
    self.std = std
    if isinstance(std, str):
        self.std = np.load(std)
    self.log2 = log2
    self.leave = leave
    self.silent = silent
    self.label = label
    self.counts = None
    self.alpha_len = len(alphabet)
    self.kmers = ["".join(i) for i in product(alphabet, repeat=k)]
    # Map each k-mer to its column index in the counts matrix
    self.map = {kmer: i for i, kmer in enumerate(self.kmers)}
    if self.seqs is not None:
        if len(self.seqs) == 1 and self.std is True:
            err = ("You cannot standardize a single sequence. "
                   "Please pass the path to an std. dev. array, "
                   "or use raw counts by setting std=False.")
            raise ValueError(err)
    if not isinstance(self.log2, Log2):
        raise TypeError(f"log2 must be one of {list(Log2)}")
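# Illustrative sketch (assumption: this is the alphabet-aware constructor of
# the k-mer counter). Precomputed normalization vectors can be supplied by
# path instead of being computed from the input, and a custom alphabet sets
# both the k-mer list and its column-index map. The file names below are
# hypothetical.
#
#   counter = BasicCounter('lncRNAs.fa', outfile='counts.npy', k=4,
#                          mean='./stats/gencode_mean.npy',
#                          std='./stats/gencode_std.npy',
#                          alphabet='ATCG')
#   # For k=4 over a 4-letter alphabet, counter.kmers holds 4**4 = 256 k-mers
#   # and counter.map sends each k-mer to its column in the counts matrix.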
def __init__(self, infasta=None, outfile=None,
             k=6, binary=True, mean=True, std=True,
             log2=True, leave=True, silent=False, label=False):
    self.infasta = infasta
    self.seqs = None
    if infasta is not None:
        self.seqs = Reader(infasta).get_seqs()
    self.outfile = outfile
    self.k = k
    self.binary = binary
    self.mean = mean
    if isinstance(mean, str):
        self.mean = np.load(mean)
    self.std = std
    if isinstance(std, str):
        self.std = np.load(std)
    self.log2 = log2
    self.leave = leave
    self.silent = silent
    self.label = label
    self.counts = None
    self.kmers = [''.join(i) for i in product('AGTC', repeat=k)]
    self.map = {kmer: i for i, kmer in enumerate(self.kmers)}
    if self.seqs is not None:
        if len(self.seqs) == 1 and self.std is True:
            err = ('You cannot standardize a single sequence. '
                   'Please pass the path to an std. dev. array, '
                   'or use raw counts by setting std=False.')
            raise ValueError(err)
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('-w', type=int, help='Window for tile size', default=1000)
parser.add_argument('-s', type=int, help='How many bp to slide tiles', default=100)
args = parser.parse_args()

###############################################################################
# Path to known functional domains
###############################################################################
target_path = args.t
target_head, target_seq = Reader(target_path).get_headers(), Reader(target_path).get_seqs()
target_dict = dict(zip(target_head, target_seq))

BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')

###############################################################################
# Parallelize transcript computations
###############################################################################
with multiprocessing.Pool(args.n) as pool:
alphabet = [letter for letter in args.a]
model = args.model
if not model.endswith('/'):
    model += '/'
kDir = model + f'{args.k}' + '/'
modelName = model.split('/')[-2]
# Check if file exists and open if so, else skip this iteration of the loop
hmm = pickle.load(open(kDir + 'hmm.mkv', 'rb'))
# Explicitly determine k from the size of the log matrix and the size of the
# alphabet used to generate it
k = int(log(len(hmm['E']['+'].keys()), len(args.a)))
kmers = [''.join(p) for p in product(alphabet, repeat=k)]  # generate k-mers
target = Reader(args.db)
targetSeqs, targetHeaders = target.get_seqs(), target.get_headers()
targetMap = defaultdict(list)
# Pool processes onto number of CPU cores specified by the user
with pool.Pool(args.n) as multiN:
    jobs = multiN.starmap(hmmCalc,
                          product(*[list(zip(targetHeaders, targetSeqs))]))
    dataDict = dict(jobs)
# Check if no hits were found
# if not all(v == None for v in dataDict.values()):
dataFrames = pd.concat([df for df in dataDict.values() if df is not None])
dataFrames['Length'] = dataFrames['End'] - dataFrames['Start']
dataFrames = dataFrames[['Start', 'End', 'Length', 'kmerLLR', 'seqName', 'Sequence']]
if not args.fasta:
    flatCoords = list(flatCoords)
    finalCoords = []
    # Group consecutive coordinates into runs: members of a run share the same
    # (index - value) offset, e.g. [1, 2, 3, 7, 8] -> [[1, 2, 3], [7, 8]].
    for k, g in groupby(enumerate(flatCoords), lambda kv: kv[0] - kv[1]):
        finalCoords.append(list(map(itemgetter(1), g)))
    return finalCoords


def nsmall(a, n):
    return np.partition(a, n)[n]


k = 4
BCD = np.load('./mamBCD4.mkv.npy')
AE = np.load('./mamAE4.mkv.npy')
mm10Genes = Reader('gencodeGenesMm10.fa')
mm10Seq = mm10Genes.get_seqs()
mm10Head = mm10Genes.get_headers()
w = 200
s = 20
seqMap = defaultdict(list)
initModelMap = {'BCD': BCD, 'AE': AE}
for head, seq in tqdm(zip(mm10Head, mm10Seq), total=len(mm10Seq)):
    # Tile each sequence into windows of width w, sliding by s bp
    tiles = [seq[i:i + w] for i in range(0, len(seq) - w + 1, s)]
    mapMm10BCD = np.array([classify(tile, k, initModelMap['BCD']) for tile in tiles])
    mapMm10AE = np.array([classify(tile, k, initModelMap['AE']) for tile in tiles])
    whereHitBCD = np.array(np.nonzero(mapMm10BCD < 0))
    whereHitAE = np.array(np.nonzero(mapMm10AE < 0))
    whereHit = np.unique(np.concatenate((whereHitBCD, whereHitAE), axis=None))
    if whereHit.size > 0:
parser.add_argument('-n', type=int,
                    help='Number of processors, default = number of CPUs available',
                    default=multiprocessing.cpu_count() - 1)
parser.add_argument('-w', type=int, help='Window for tile size', default=1000)
parser.add_argument('-s', type=int, help='How many bp to slide tiles', default=100)
args = parser.parse_args()

kmers = [''.join(p) for p in product('AGTC', repeat=args.k)]
kmer_map = dict(zip(kmers, range(4**args.k)))

###############################################################################
# Path to known functional domains
###############################################################################
query_path = './queries/queries.fa'
target_path = args.t
target_head, target_seq = Reader(target_path).get_headers(), Reader(target_path).get_seqs()
target_dict = dict(zip(target_head, target_seq))
queries = dict(zip(Reader(query_path).get_headers(),
                   Reader(query_path).get_seqs()))

###############################################################################
# Load precomputed mean and standard deviation vectors
###############################################################################
mean_paths = [f for f in glob.iglob('./stats/*mean.npy')]
std_paths = [f for f in glob.iglob('./stats/*std.npy')]
means = {}
for mean_path in mean_paths:
    means[basename(mean_path)] = np.load(mean_path)
stds = {}
for std_path in std_paths:
def __init__(self, infasta=None, outfasta=None, outnames=None):
    self.infasta = infasta
    if infasta is not None:
        self.data, self.names, self.seqs = Reader(infasta).get_data()
    self.outfasta = outfasta
    self.outnames = outnames
hits = pickle.load(open(args.hits, 'rb'))
df = pd.DataFrame.from_dict(hits, orient='index')
query_dict = defaultdict(list)
for row in df.iterrows():
    query_dict = defaultdict(list)
    name = row[0]
    data = row[1:][0][0]
    for i in data:
        query_dict[i[1]].append(i[0])
    hits[name] = query_dict.copy()
    query_dict.clear()
parsedDf = pd.DataFrame.from_dict(hits, orient='index')
queryFasta = Reader('./queries.fa')
queryHeaders = queryFasta.get_headers()
queryMap = dict(zip(list(range(13)), queryHeaders))
parsedDf.rename(columns=queryMap, inplace=True)
queryCons = {}
for query in parsedDf:
    seekrGenomicCoords = dSeekrCoords(parsedDf, args.w, args.s)
    with multiprocessing.Pool(args.n) as pool:
        ha = pool.starmap(getCons,
                          product(*[[list(seekrGenomicCoords.items())], [mouseGTF]]))
    merge = dict(ha)
    queryCons[query] = merge
print(queryCons)
1/0  # deliberate halt left in for debugging
    '-n', type=int,
    help=('Number of CPU cores. Each job corresponds to a value of k, '
          'and the program scales well with multiprocessing'),
    default=1)
args = parser.parse_args()

if __name__ == '__main__':
    # Read in specified values of k, and the alphabet
    kVals = [int(i) for i in args.k.split(',')]
    a = args.a.upper()
    # SEEKR fasta reader module
    F = Reader(args.fasta)
    fS = F.get_seqs()
    # Join sequences together using $ delimiter character
    fString = '$'.join(fS).upper()
    lenFString = sum([len(i) for i in fS])
    # Need to figure out how to deal with very long fasta files
    # (~2-3X the size of the transcriptome in mice)
    # if lenFString >= 2147483647:
    #     fString = '$'.join(fS[::10]).upper()
    # Split jobs onto processors and call kmers.pyx cython file.
    # product(*[[fString], kVals, [a]]) yields one (fString, k, a) tuple per
    # value of k, e.g. kVals=[2, 3] -> (fString, 2, a), (fString, 3, a).
    with pool.Pool(args.n) as multiN:
        jobs = multiN.starmap(kmers.main, product(*[[fString], kVals, [a]]))
        dataDict = dict(jobs)
        for j, cj in enumerate('ATCG'):
            conds[i, j] = kmers[ci + cj] / float(tot)
    return conds


# Null model: weighted-average genome k-mer counts -> Markov transition matrix
genomeRawCounts = BasicCounter(
    '/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa',
    k=k, mean=False, std=False, log2=False, alphabet='ATCG')
genomeFa = Reader('/Users/danielsprague/Downloads/gencode.vM21.transcripts.fa')
genomeSeqs = genomeFa.get_seqs()
genomeSeqLens = [len(i) for i in genomeSeqs]
genomeRawCounts.get_counts()
# Convert length-normalized counts back to (approximate) raw counts per transcript
unNormGenomeCounts = genomeRawCounts.counts.T * genomeSeqLens / 1000
genomeCounts = np.rint(unNormGenomeCounts.T)
weightedAvgGenomeCounts = np.average(genomeCounts, weights=genomeSeqLens, axis=0)
kmers = [''.join(p) for p in itertools.product('ATCG', repeat=k)]
curr_kmers = dict(zip(kmers, weightedAvgGenomeCounts))
genome_avg = markov_chain(curr_kmers, k, 'ATCG')
np.save(f'./genome{k}.mkv.npy', genome_avg)
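# Illustrative sketch (assumption: markov_chain above converts the weighted
# average k-mer counts into a first-order transition matrix roughly like this;
# transition_matrix and its signature are hypothetical, not the module's API).
# Each row is a (k-1)-mer prefix, each column a nucleotide, and each entry is
# count(prefix + nucleotide) normalized over that prefix.
#
#   import numpy as np
#   from itertools import product
#
#   def transition_matrix(kmer_counts, k, alphabet='ATCG'):
#       prefixes = [''.join(p) for p in product(alphabet, repeat=k - 1)]
#       conds = np.zeros((len(prefixes), len(alphabet)))
#       for i, prefix in enumerate(prefixes):
#           tot = sum(kmer_counts[prefix + c] for c in alphabet)
#           for j, c in enumerate(alphabet):
#               conds[i, j] = kmer_counts[prefix + c] / float(tot)
#       return conds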
parser = argparse.ArgumentParser()
parser.add_argument('-k', type=int)
parser.add_argument('--db', type=str,
                    help='Path to fasta file containing training sequences')
parser.add_argument('--prior', type=str,
                    help='Path to binary .mkv file output from train.py '
                         '(e.g. markovModels/D_null/2/hmm.mkv)')
parser.add_argument('-cf', '--createfile', action='store_true',
                    help='Create new file rather than overwrite')
parser.add_argument('--its', type=int, help='Iterations to do, default=20', default=20)
args = parser.parse_args()

assert args.its > 0, 'Please provide an integer greater than or equal to 1'
assert args.k > 0, 'Please provide an integer greater than or equal to 1'

fa = Reader(args.db)
seqs = '$'.join(fa.get_seqs())
model = args.prior
k = args.k
# Identify the location of any ambiguous nucleotides (N)
O, oIdx, nBP = corefunctions.kmersWithAmbigIndex(seqs, k)
# Load in train.py output
hmm = pickle.load(open(args.prior, 'rb'))
'''
A - transition matrix (dictionary)
E - emission matrix (dictionary)
pi - initial state probabilities (always 50/50)
states - list of states (query, null)