def merge_matrices(self, binary=False):
    # NOTE: binary is currently unused; the histone matrix is always
    # thresholded at the cutoff defined below.
    try:
        print("[STATUS] Trying to load cp gz: " + self.h3file)
        self.H = AUX.load_pickle_gzip(self.h3file)
    except Exception:
        print("[STATUS] Trying to load sparse csr: " + self.h3file)
        # NOTE: Won't work unless we strip off .cp.gz:
        self.H = load_sparse_csr(self.h3file)
    # Load attributes:
    print("[STATUS] Load attributes in cp gz: " + self.h3attr)
    self.hattr = AUX.load_pickle_gzip(self.h3attr)
    # Reduce names to their BSS identifiers:
    self.hnames = self.hattr['names']
    tmpnames = [re.sub("^.*=BSS", "BSS", n) for n in self.hnames]
    self.hnames = [re.sub("_.*$", "", n).strip("\n") for n in tmpnames]
    print(self.hnames[0:10])
    # NOTE: Set to 0 if lower than cutoff=2:
    cutoff = 2.0
    if self.X.shape == self.H.shape:
        # Make sure that rows etc. match:
        hord = np.array([np.where(np.array(self.hnames) == n)[0][0]
                         for n in self.names])
        print("[STATUS] Reordering histone matrix")
        print(self.names[0:10])
        print(self.hnames[0:10])
        print(hord[0:10])
        # Turn H into a binary matrix, thresholded at the cutoff:
        self.Hord = 1 * (self.H[:, hord] >= cutoff)
        del self.H
        Xtmp = self.X.multiply(self.Hord)
        print(str(self.X.nnz) + " (ann) to " + str(Xtmp.nnz))
        print(str(self.Hord.nnz) + " (histone) to " + str(Xtmp.nnz))
        self.X = Xtmp
        del Xtmp
    else:
        # Shapes differ: keep only H entries above the cutoff, then
        # multiply each matching column group of X by its H column.
        Hcoo = self.H.tocoo()
        print("[STATUS] Reducing to values above " + str(cutoff))
        ind = np.where(Hcoo.data >= cutoff)[0]
        Hred = coo_matrix((Hcoo.data[ind], (Hcoo.row[ind], Hcoo.col[ind])),
                          shape=self.H.shape)
        print(str(Hcoo.nnz) + " to " + str(Hred.nnz))
        del self.H, Hcoo
        self.H = Hred.tocsc()
        del Hred
        # Work in CSC for efficient column slicing:
        Xcsc = self.X.tocsc()
        for i in range(len(self.hnames)):
            nam = self.hnames[i]
            print(i, nam)
            # Columns of X whose (split) name matches this histone track:
            mtid = [j for j, v in enumerate(self.names_split) if v == nam]
            print(mtid)
            if len(mtid) > 0:
                mtid = np.array(mtid)
                # Column i as an (M, 1) sparse vector:
                vec = self.H[:, i]
                print(vec.shape)
                Xcsc[:, mtid] = Xcsc[:, mtid].multiply(vec)
        print(str(self.X.nnz) + " to " + str(Xcsc.nnz))
        self.X = Xcsc.tocsr()
        del Xcsc, self.H
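# A minimal standalone sketch (not part of the class) of the masking step in
# merge_matrices above: binarize the histone matrix H at the cutoff, then
# keep entries of X only where the mask is nonzero. Toy matrices; the names
# here are illustrative, not from this module.
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.array([[3, 0], [2, 5]]))
#   H = csr_matrix(np.array([[1, 4], [2, 0]]))
#   Hmask = 1 * (H >= 2.0)         # binary mask, thresholded as above
#   Xmasked = X.multiply(Hmask)    # elementwise product zeroes masked entries
#   # Xmasked.toarray() -> [[0, 0], [2, 0]]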
def load_cpfiles(self, main, attr):
    if os.path.isfile(main):
        self.verboseprint("[STATUS] Trying to load cp: " + main)
        X = AUX.load_pickle_gzip(main)
        out = AUX.load_pickle_gzip(attr)
        names = out['names']
        self.verboseprint("[STATUS] Loaded cp file successfully")
    else:
        raise FileNotFoundError("No such file: " + main)
    return [X, names]
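# Expected call pattern for load_cpfiles (the paths are placeholders):
#
#   X, names = self.load_cpfiles("mat_csr.cp.gz", "mat_attr.cp.gz")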
def load_data(self):
    try:
        print("[STATUS] Trying to load cp gz: " + self.filename)
        self.X = AUX.load_pickle_gzip(self.filename)
    except Exception:
        print("[STATUS] Trying to load sparse csr: " + self.filename)
        self.X = load_sparse_csr(self.filename)
    # Load attributes:
    print("[STATUS] Load attributes in cp gz: " + self.attrfile)
    self.attr = AUX.load_pickle_gzip(self.attrfile)
    self.names = self.attr['names']
    self.names_split = [n.split("_")[0] for n in self.names]
    try:
        self.states_split = [n.split("_")[1] for n in self.names]
    except IndexError:
        print("[STATUS] Do not need to merge states because already merged")
        self.states_split = []
        self.mergestates = False
    # Collapse matrix if necessary:
    if self.mergestates:
        self.collapse_matrix()
    # Merge with the histone matrix after collapsing (or not):
    if self.h3file is not None:
        self.merge_matrices(binary=self.binarymat)
    # Get plotting order:
    self.nidx = []
    for n in self.names_split:
        self.nidx.append([i for i, v in enumerate(self.order) if v == n][0])
    print("Saving rownames...")
    AUX.write_list(self.outprefix + "_names.tsv", self.names)
    print("NNAMES: " + str(len(self.names)))
    # Flag very large (binned, not peak-level) matrices:
    self.allbin = (self.X.shape[0] > 15000000)
    print("[STATUS] Loaded data with shape: " + str(self.X.shape))
    if self.keepfile is None:
        self.marg = np.sum(self.X, axis=1)
        # Locate points above cutoff:
        self.keptix = np.where(self.marg > self.cutoff)[0]
        # NOTE: For future versions, merge peaks; not straightforward now.
        # Keep locations for original peaks.
        print("[STATUS] Keeping " + str(len(self.keptix)) +
              " peaks with evidence > " + str(self.cutoff))
    else:
        print("[STATUS] Loading indices from: " + self.keepfile)
        self.keptix = AUX.load_pickle_gzip(self.keepfile)
        print("[STATUS] Keeping " + str(len(self.keptix)) + " peaks")
    self.X = self.X[self.keptix, ] * 1.0
    self.M = self.X.shape[0]
    self.N = self.X.shape[1]
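# Standalone sketch of the peak-filtering step in load_data (toy data, names
# illustrative): peaks are kept when their total evidence (row sum) exceeds
# the cutoff.
#
#   import numpy as np
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.array([[1, 0, 0], [2, 3, 1], [0, 0, 0]]))
#   marg = np.sum(X, axis=1)          # per-peak evidence: [[1], [6], [0]]
#   keptix = np.where(marg > 1)[0]    # array([1]); only the middle row passes
#   X = X[keptix, ] * 1.0             # subset rows and cast to float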
def load_index(self):
    try:
        # Try to load processed locations (faster):
        readfile = self.cplocfile
        if os.path.isfile(readfile):
            print("[STATUS] Loading processed masterlist: " + readfile)
            out = AUX.load_pickle_gzip(readfile)
            self.idict = out['loc']
            self.ndict = out['names']
            self.wdict = out['weights']
        else:
            raise FileNotFoundError("No such file: " + readfile)
    except Exception:
        # Otherwise read the raw index file and process it:
        print("[STATUS] Processing masterlist")
        self.ndict = {}
        self.idict = {}
        self.wdict = {}
        self.oldchr = ''
        with open(self.indexlocfile, 'r') as f:
            for line in f:
                self.process_index_line(line)
        # Process the aggregate, per chromosome:
        for key in self.idict.keys():
            # Flatten:
            il = np.array(flatten_list(self.idict[key]))
            wl = np.array(flatten_list(self.wdict[key]))
            nl = np.array(flatten_list(self.ndict[key]))
            # Order lists according to bin number:
            order = np.argsort(il)
            self.idict[key] = il[order]
            self.wdict[key] = wl[order]
            self.ndict[key] = nl[order]
        # Save it as a pickled file:
        out = {'names': self.ndict,
               'loc': self.idict,
               'weights': self.wdict}
        AUX.save_pickle_gzip(self.cplocfile, out)
    final_names = []
    final_chr = []
    print("[STATUS] Getting order of names:")
    for chrom in self.chrlist:
        # NOTE: Copy exactly the name sort from merge bins:
        nlist = list(np.unique(self.ndict[chrom]))
        final_names = final_names + nlist
        final_chr = final_chr + [chrom] * len(nlist)
    # Write out final names as a readable TSV:
    print("[STATUS] Writing names list order to: " + self.cpnamfile)
    ndf = pd.DataFrame({'name': final_names,
                        'chr': final_chr,
                        'cls': list(np.arange(len(final_names)) + 1)})
    ndf.to_csv(self.cpnamfile, sep='\t', index=False)
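# For reference, the processed masterlist pickle written by load_index has
# (roughly) this structure, keyed by chromosome; the values shown here are
# illustrative only:
#
#   out = {
#       'loc':     {'chr1': np.array([0, 3, 7])},        # sorted bin indices
#       'weights': {'chr1': np.array([0.5, 1.0, 2.0])},  # per-bin weights
#       'names':   {'chr1': np.array(['pk1', 'pk2', 'pk3'])},  # peak names
#   }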
def concatenate_chrom(self):
    print("[STATUS] Concatenating all chromosomes")
    out_pref = self.out + self.re_mid + "_allchr"
    if self.mergestates:
        out_pref = out_pref + "_merged"
    out_attr = out_pref + "_attr.cp.gz"
    full_names = None
    self.X = None
    for chrom in tqdm(self.chrlist):
        chrompref = self.out + "_" + chrom + self.re_mid
        if self.mergestates:
            chrompref = chrompref + "_merged"
        chrdata_file = chrompref + "_csr.cp.gz"
        chrattr_file = chrompref + "_attr.cp.gz"
        # data_file = self.out + "_" + chrom + self.re_mid
        # attr_file = data_file + "_attr.cp.gz"
        # Process the chromosome first if its matrix is missing:
        if not os.path.isfile(chrdata_file):
            self.process_chrom(chrom)
        X_chr = csr_matrix(AUX.load_file_save_sparse(chrompref))
        attr = AUX.load_pickle_gzip(chrattr_file)
        if full_names is None:
            full_names = attr['names']
        if attr['names'] != full_names:
            raise ValueError("Not the same names!")
        # Stack chromosomes vertically:
        if self.X is None:
            self.X = X_chr
        else:
            self.X = vstack([self.X, X_chr])
    print("[STATUS] Writing final matrix")
    # Add indexes dictionary and save attributes:
    AUX.save_pickle_gzip(out_attr, attr)
    # Save as CSR sparse (both NPZ and CP):
    AUX.save_pickle_gzip(out_pref + "_csr.cp.gz", self.X)
    AUX.save_sparse_csr(out_pref + "_csr", self.X)
    if self.chromhmm and isinstance(self.states, list):
        self.write_collapsed_matrix(out_pref)
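# Hypothetical end-to-end usage. The class name and constructor arguments
# below are assumptions for illustration; neither is defined in this section:
#
#   proc = MatrixProcessor(filename=..., attrfile=..., ...)  # hypothetical
#   proc.load_index()          # build or load the masterlist location dicts
#   proc.concatenate_chrom()   # stack per-chromosome CSR matrices into X
#   proc.load_data()           # reload, merge, and filter peaks by evidence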