def load_index(self):
    """Load the masterlist index, rebuilding it from the raw file if needed.

    First tries the processed cache at ``self.cplocfile``; on success the
    cached ``'loc'``, ``'names'`` and ``'weights'`` entries become
    ``self.idict``, ``self.ndict`` and ``self.wdict`` and nothing else is
    done.

    If the cache is missing or cannot be read, the index is rebuilt:
    every line of ``self.indexlocfile`` is passed to
    ``self.process_index_line``, the per-key lists are flattened and
    sorted by location, the result is pickled to ``self.cplocfile``, and
    the order of unique names per chromosome is written as a TSV to
    ``self.cpnamfile`` (columns ``name``, ``chr``, ``cls``).
    """
    readfile = self.cplocfile
    if os.path.isfile(readfile):
        # Try to load processed locations (faster).
        print("[STATUS] Loading processed masterlist: " + readfile)
        try:
            out = AUX.load_pickle_gzip(readfile)
            loc, names, weights = out['loc'], out['names'], out['weights']
        except Exception as err:
            # Best-effort cache: an unreadable or incomplete file falls
            # through to a rebuild. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still stop the program.
            print("[WARNING] Could not load processed masterlist (" +
                  str(err) + "), reprocessing")
        else:
            self.idict = loc
            self.ndict = names
            self.wdict = weights
            return

    # Otherwise read the file and process it:
    print("[STATUS] Processing masterlist")
    self.ndict = {}
    self.idict = {}
    self.wdict = {}
    self.oldchr = ''
    with open(self.indexlocfile, 'r') as f:
        for line in f:
            self.process_index_line(line)

    # Process the aggregate (snapshot the keys; values are replaced below):
    for key in list(self.idict):
        # Flatten:
        il = np.array(flatten_list(self.idict[key]))
        wl = np.array(flatten_list(self.wdict[key]))
        nl = np.array(flatten_list(self.ndict[key]))
        # Order all three arrays by the sorted order of the locations:
        order = np.argsort(il)
        self.idict[key] = il[order]
        self.wdict[key] = wl[order]
        self.ndict[key] = nl[order]

    # Save it as pickled file
    out = {
        'names': self.ndict,
        'loc': self.idict,
        'weights': self.wdict,
    }
    AUX.save_pickle_gzip(self.cplocfile, out)

    final_names = []
    final_chr = []
    print("[STATUS] Getting order of names:")
    for chrom in self.chrlist:
        # NOTE: Copy exactly the name sort from merge bins:
        nlist = list(np.unique(self.ndict[chrom]))
        final_names.extend(nlist)
        final_chr.extend([chrom] * len(nlist))

    # Write out final names as readable TSV; 'cls' is a 1-based running
    # number over the concatenated name list.
    print("[STATUS] Writing names list order to: " + self.cpnamfile)
    ndf = pd.DataFrame({
        'name': final_names,
        'chr': final_chr,
        'cls': list(np.arange(len(final_names)) + 1),
    })
    ndf.to_csv(self.cpnamfile, sep='\t', index=False)
def write_collapsed_matrix(self, pref):
    """Write a column-collapsed CSR copy of ``self.X`` unless it exists.

    The output path is ``pref + "_collapsed_csr.cp.gz"``. With
    ``NS = len(self.states)``, each column index of ``self.X`` is divided
    by ``NS`` and truncated to an integer, so every run of ``NS``
    consecutive columns maps onto one output column. The COO view of
    ``self.X`` is kept on ``self.Xcoo``.

    Args:
        pref: Path prefix for the collapsed output file.
    """
    collfile = pref + "_collapsed_csr.cp.gz"
    NS = len(self.states)
    if os.path.exists(collfile):
        # Already written on a previous run; nothing to do.
        return

    # To COO, reduced representation:
    self.Xcoo = coo_matrix(self.X)
    merged_cols = (self.Xcoo.col / NS).astype('int')
    collapsed_shape = (self.X.shape[0], int(self.X.shape[1] / NS))
    entries = (self.Xcoo.data, (self.Xcoo.row, merged_cols))
    # Entries that now share a (row, col) pair are summed by SciPy when the
    # COO matrix is converted to CSR.
    Xcsr = csr_matrix(coo_matrix(entries, collapsed_shape))
    print("[STATUS] Collapsed matrix to size: " + str(Xcsr.shape))
    AUX.save_pickle_gzip(collfile, Xcsr)
    print("[STATUS] Done writing out collapsed CSR")
def process_chrom(self, chrom):
    """Build and save the combined matrix for one chromosome.

    For every index in ``self.pid`` a matrix and its column names are
    fetched (``self.get_mat_idlist`` when ``self.intindex`` is set,
    otherwise ``self.get_mat``). The matrices are stacked horizontally in
    ``self.pid`` order and the names concatenated in the same order.

    The stacked matrix is pickled to ``<prefix>_csr.cp.gz`` and
    ``{'names': [...]}`` to ``<prefix>_attr.cp.gz``, where ``<prefix>`` is
    ``self.out + "_" + chrom + self.re_mid`` plus ``"_merged"`` when
    ``self.mergestates`` is set.

    Args:
        chrom: Chromosome label, used in input and output file names.

    Raises:
        ValueError: If ``self.pid`` is empty, so there is nothing to stack.
    """
    print("[STATUS] Processing " + chrom)
    chrompref = self.out + "_" + chrom + self.re_mid
    if self.mergestates:
        chrompref = chrompref + "_merged"
    chrdata_file = chrompref + "_csr.cp.gz"
    chrattr_file = chrompref + "_attr.cp.gz"

    blocks = []
    namelist = []
    for idnum in tqdm(self.pid):
        currid = self.ids[idnum]
        filepref = self.prefixes[idnum] + chrom + self.suffixes[idnum]
        maindir = self.dirs[idnum]
        self.verboseprint(currid + ": " + filepref)
        if self.intindex:
            # Get intersected matrix:
            X, names = self.get_mat_idlist(filepref, chrom, maindir)
        else:
            # Get full matrix for states
            X, names = self.get_mat(filepref, maindir)
        # A single name may come back bare; normalise to a list.
        if not isinstance(names, list):
            names = [names]
        blocks.append(X)
        namelist.extend(names)

    if not blocks:
        raise ValueError("No datasets to process for " + chrom)

    # Stack once instead of re-stacking the growing matrix on every
    # iteration; a single block is kept as returned by the getter.
    FULL = blocks[0] if len(blocks) == 1 else hstack(blocks)

    # Print out dataset:
    attr = {'names': namelist}
    print("[STATUS] Writing chromosome " + chrom +
          " dataset to: " + chrdata_file)
    AUX.save_pickle_gzip(chrdata_file, FULL)
    AUX.save_pickle_gzip(chrattr_file, attr)
def concatenate_chrom(self):
    """Stack all per-chromosome matrices into ``self.X`` and save it.

    For each chromosome in ``self.chrlist`` the per-chromosome dataset is
    created with ``self.process_chrom`` if its data file is missing, then
    loaded as CSR together with its attribute dict. Every chromosome must
    carry the same ``'names'`` as the first one. The matrices are stacked
    vertically in ``self.chrlist`` order.

    Outputs, all under the prefix ``self.out + self.re_mid + "_allchr"``
    (plus ``"_merged"`` when ``self.mergestates`` is set):
    ``_attr.cp.gz`` (attributes), ``_csr.cp.gz`` (pickled matrix) and the
    file(s) written by ``AUX.save_sparse_csr`` for ``_csr``. When
    ``self.chromhmm`` is set and ``self.states`` is a list, the collapsed
    matrix is written as well.

    Raises:
        ValueError: If ``self.chrlist`` is empty, or if a chromosome's
            names differ from those of the first chromosome.
    """
    print("[STATUS] Concatenating all chromosomes")
    out_pref = self.out + self.re_mid + "_allchr"
    if self.mergestates:
        out_pref = out_pref + "_merged"
    out_attr = out_pref + "_attr.cp.gz"

    full_names = None
    attr = None
    blocks = []
    self.X = None
    for chrom in tqdm(self.chrlist):
        chrompref = self.out + "_" + chrom + self.re_mid
        if self.mergestates:
            chrompref = chrompref + "_merged"
        chrdata_file = chrompref + "_csr.cp.gz"
        chrattr_file = chrompref + "_attr.cp.gz"
        # Build the per-chromosome dataset on demand.
        if not os.path.isfile(chrdata_file):
            self.process_chrom(chrom)
        X_chr = csr_matrix(AUX.load_file_save_sparse(chrompref))
        attr = AUX.load_pickle_gzip(chrattr_file)
        if full_names is None:
            full_names = attr['names']
        if attr['names'] != full_names:
            raise ValueError("Not the same names!")
        blocks.append(X_chr)

    if not blocks:
        raise ValueError("No chromosomes to concatenate (chrlist is empty)")

    # Stack once instead of re-stacking the growing matrix per chromosome.
    self.X = blocks[0] if len(blocks) == 1 else vstack(blocks)

    print("[STATUS] Writing final matrix")
    # Save attributes (those of the last chromosome; names were checked to
    # match across all chromosomes above):
    AUX.save_pickle_gzip(out_attr, attr)
    # Save as CSR sparse (both NPZ and CP)
    AUX.save_pickle_gzip(out_pref + "_csr.cp.gz", self.X)
    AUX.save_sparse_csr(out_pref + "_csr", self.X)
    if self.chromhmm and isinstance(self.states, list):
        self.write_collapsed_matrix(out_pref)
def save_cpfiles(self, X, names, main, attr):
    """Pickle a matrix and its names to two gzip files.

    Args:
        X: Object written to ``main``.
        names: Stored under the ``'names'`` key of the dict written to
            ``attr``.
        main: Output path for ``X``.
        attr: Output path for the ``{'names': names}`` dict.
    """
    print("[STATUS] Saving to: " + main)
    AUX.save_pickle_gzip(main, X)
    AUX.save_pickle_gzip(attr, {'names': names})