def merge_matrices(self, binary=False):
     """Mask the annotation matrix ``self.X`` with the histone matrix.

     The histone matrix is read from ``self.h3file`` (gzipped pickle first,
     sparse CSR loader as a fallback) and its attributes from
     ``self.h3attr``. Histone column names are normalized by dropping
     everything before ``BSS`` and everything from the first underscore on.

     Two merge strategies are used:

     * Shapes of ``self.X`` and ``self.H`` are equal: histone columns are
       reordered to follow ``self.names``, thresholded into a 0/1 matrix
       (stored as ``self.Hord``) and multiplied element-wise into ``self.X``.
     * Shapes differ: histone entries below the cutoff are dropped, and each
       histone column is multiplied into every column of ``self.X`` whose
       ``self.names_split`` entry equals that histone column's name.
       Histone columns without a matching name are skipped.

     Args:
         binary: Accepted for interface compatibility; not read by this
             method.

     Side effects:
         Replaces ``self.X``, sets ``self.hattr`` and ``self.hnames``, and
         deletes ``self.H`` before returning.
     """
     print("[STATUS] Trying to load cp gz: " + self.h3file)
     try:
         self.H = AUX.load_pickle_gzip(self.h3file)
     except Exception:
         # Fall back on the sparse loader; ``Exception`` (not a bare except)
         # so that Ctrl-C / SystemExit still abort the run.
         print("[STATUS] Trying to load sparse csr: " + self.h3file)
         # NOTE: Won't work unless strip off .cp.gz:
         self.H = load_sparse_csr(self.h3file)
     # Load attributes:
     print("[STATUS] Load attributes in cp gz: " + self.h3attr)
     self.hattr = AUX.load_pickle_gzip(self.h3attr)
     self.hnames = self.hattr['names']
     stripped = [re.sub("^.*=BSS", "BSS", n) for n in self.hnames]
     self.hnames = [re.sub("_.*$", "", n).strip("\n") for n in stripped]
     print(self.hnames[0:10])
     # NOTE: Set 0 if lower than cutoff=2
     cutoff = 2.0
     if self.X.shape == self.H.shape:
         # Make sure that rows etc match: position of each annotation name
         # within the histone names (first match wins, IndexError if absent).
         hnames_arr = np.array(self.hnames)  # built once, not per name
         hord = np.array([np.where(hnames_arr == n)[0][0]
                          for n in self.names])
         print("[STATUS] Reordering histone matrix")
         print(self.names[0:10])
         print(self.hnames[0:10])
         print(hord[0:10])
         # Turn the reordered histone matrix into a 0/1 matrix at the cutoff:
         self.Hord = 1 * (self.H[:, hord] >= cutoff)
         del self.H
         masked = self.X.multiply(self.Hord)
         print(str(self.X.nnz) + " (ann) to " + str(masked.nnz))
         print(str(self.Hord.nnz) + " (histone) to " + str(masked.nnz))
         self.X = masked
     else:
         Hcoo = self.H.tocoo()
         print("[STATUS] Reducing to values above " + str(cutoff))
         keep = np.where(Hcoo.data >= cutoff)[0]
         Hred = coo_matrix(
             (Hcoo.data[keep], (Hcoo.row[keep], Hcoo.col[keep])),
             shape=self.H.shape)
         print(str(Hcoo.nnz) + " to " + str(Hred.nnz))
         del Hcoo
         self.H = Hred.tocsc()
         del Hred
         # Group annotation columns by their split name once, instead of
         # rescanning names_split for every histone column.
         cols_by_name = {}
         for j, v in enumerate(self.names_split):
             cols_by_name.setdefault(v, []).append(j)
         Xcsc = self.X.tocsc()
         for i, nam in enumerate(self.hnames):
             print(i, nam)
             mtid = cols_by_name.get(nam, [])
             print(mtid)
             if not mtid:
                 # No annotation column carries this name: nothing to mask.
                 continue
             cols = np.array(mtid)
             vec = self.H[:, i]  # Histone column used as the mask
             print(vec.shape)
             Xcsc[:, cols] = Xcsc[:, cols].multiply(vec)
         print(str(self.X.nnz) + " to " + str(Xcsc.nnz))
         self.X = Xcsc.tocsr()
         del self.H
Exemple #2
0
 def load_cpfiles(self, main, attr):
     """Load a gzipped-pickle object and the names stored alongside it.

     Args:
         main: Path of the main gzipped pickle; must be an existing file.
         attr: Path of the attribute gzipped pickle; its ``'names'`` entry
             is returned.

     Returns:
         A two-element list ``[X, names]``: the object loaded from ``main``
         and the ``'names'`` entry loaded from ``attr``.

     Raises:
         Exception: If ``main`` is not an existing file.
     """
     # Guard clause: bail out before announcing or attempting any load.
     if not os.path.isfile(main):
         raise Exception("No such file: " + main)
     self.verboseprint("[STATUS] Trying to load cp: " + main)
     matrix = AUX.load_pickle_gzip(main)
     attributes = AUX.load_pickle_gzip(attr)
     loaded_names = attributes['names']
     self.verboseprint("[STATUS] Loaded cp file successfully")
     return [matrix, loaded_names]
 def load_data(self):
     """Load the main matrix and its attributes, then filter its rows.

     Steps, in order:

     1. Load ``self.X`` from ``self.filename`` (gzipped pickle first, sparse
        CSR loader as a fallback) and ``self.attr`` from ``self.attrfile``.
     2. Split every name on ``"_"`` into ``self.names_split`` (first field)
        and ``self.states_split`` (second field). If any name has no second
        field, ``self.states_split`` is emptied and ``self.mergestates`` is
        switched off.
     3. Call ``self.collapse_matrix()`` when ``self.mergestates`` is set and
        ``self.merge_matrices()`` when ``self.h3file`` is not None.
     4. Record in ``self.nidx`` the first position of each split name in
        ``self.order`` and write ``self.names`` to
        ``<outprefix>_names.tsv``.
     5. Choose the rows to keep: those whose row sum exceeds
        ``self.cutoff``, or the indices stored in ``self.keepfile`` when one
        is given. ``self.X`` is reduced to those rows and multiplied by 1.0.

     Sets ``self.M`` / ``self.N`` to the final row / column counts.

     Raises:
         IndexError: If a split name does not occur in ``self.order``.
     """
     print("[STATUS] Trying to load cp gz: " + self.filename)
     try:
         self.X = AUX.load_pickle_gzip(self.filename)
     except Exception:
         # Fall back on the sparse loader; ``Exception`` (not a bare except)
         # so that Ctrl-C / SystemExit still abort the run.
         print("[STATUS] Trying to load sparse csr: " + self.filename)
         self.X = load_sparse_csr(self.filename)
     # Load attributes:
     print("[STATUS] Load attributes in cp gz: " + self.attrfile)
     self.attr = AUX.load_pickle_gzip(self.attrfile)
     self.names = self.attr['names']
     self.names_split = [n.split("_")[0] for n in self.names]
     try:
         self.states_split = [n.split("_")[1] for n in self.names]
     except IndexError:
         # A name without "_" has no state part to split off.
         print("[STATUS] Do not need to merge states because already merged")
         self.states_split = []
         self.mergestates = False
     # Collapse matrix if necessary:
     if self.mergestates:
         self.collapse_matrix()
     # Merge after collapse/not:
     if self.h3file is not None:
         self.merge_matrices(binary=self.binarymat)
     # Get plotting order (first occurrence of each name in self.order):
     self.nidx = [
         [i for i, v in enumerate(self.order) if v == n][0]
         for n in self.names_split
     ]
     print("Saving rownames...")
     AUX.write_list(self.outprefix + "_names.tsv", self.names)
     print("NNAMES: " + str(len(self.names)))
     # Flag matrices with more than 15M rows:
     self.allbin = (self.X.shape[0] > 15000000)
     print("[STATUS] " + "Loaded data with shape: " + str(self.X.shape))
     if self.keepfile is None:
         # Row sums over all columns:
         self.marg = np.sum(self.X, axis=1)
         # Locate points above cutoff
         self.keptix = np.where(self.marg > self.cutoff)[0]
         # NOTE: For future versions, merge peaks, not straightforward now
         # Keep locations for original peaks
         print("[STATUS] " + "Keeping " + str(len(self.keptix)) +
               " peaks with evidence > " + str(self.cutoff))
     else:
         print("[STATUS] Loading indices from: " + self.keepfile)
         self.keptix = AUX.load_pickle_gzip(self.keepfile)
         print("[STATUS] Keeping " + str(len(self.keptix)) + " peaks")
     # Subset the rows; the * 1.0 promotes the values to float.
     self.X = self.X[self.keptix, ] * 1.0
     self.M = self.X.shape[0]
     self.N = self.X.shape[1]
Exemple #4
0
 def load_index(self):
     """Populate the location, name and weight dictionaries of the index.

     Fast path: if ``self.cplocfile`` exists, its gzipped pickle is loaded
     and its ``'loc'``, ``'names'`` and ``'weights'`` entries become
     ``self.idict``, ``self.ndict`` and ``self.wdict``.

     Rebuild path (cache missing or loading it failed): every line of
     ``self.indexlocfile`` is passed to ``self.process_index_line`` (which is
     expected to fill the three dictionaries -- defined elsewhere, confirm
     there). Each per-key list is then flattened and sorted by its location
     values, the result is saved to ``self.cplocfile``, and a TSV with the
     columns ``name``, ``chr`` and ``cls`` (1-based running number) is
     written to ``self.cpnamfile``.

     The names TSV is only written on the rebuild path.
     """
     try:
         # Try to load processed locations (faster)
         readfile = self.cplocfile
         if not os.path.isfile(readfile):
             raise Exception("No such file: " + readfile)
         print("[STATUS] Loading processed masterlist: " + readfile)
         out = AUX.load_pickle_gzip(readfile)
         self.idict = out['loc']
         self.ndict = out['names']
         self.wdict = out['weights']
     except Exception:
         # Any ordinary failure above means "no usable cache": rebuild it.
         # ``Exception`` rather than a bare except, so that Ctrl-C during
         # the load aborts instead of starting the slow rebuild.
         print("[STATUS] Processing masterlist")
         self.ndict = {}
         self.idict = {}
         self.wdict = {}
         self.oldchr = ''
         with open(self.indexlocfile, 'r') as f:
             for line in f:
                 self.process_index_line(line)
         # Process the aggregate:
         for key in self.idict:
             # Flatten:
             il = np.array(flatten_list(self.idict[key]))
             wl = np.array(flatten_list(self.wdict[key]))
             nl = np.array(flatten_list(self.ndict[key]))
             # Order lists according to bin #:
             order = np.argsort(il)
             self.idict[key] = il[order]
             self.wdict[key] = wl[order]
             self.ndict[key] = nl[order]
         # Save it as pickled file
         out = {
             'names': self.ndict,
             'loc': self.idict,
             'weights': self.wdict
         }
         AUX.save_pickle_gzip(self.cplocfile, out)
         final_names = []
         final_chr = []
         print("[STATUS] Getting order of names:")
         for chrom in self.chrlist:
             # NOTE: Copy exactly the name sort from merge bins:
             nlist = list(np.unique(self.ndict[chrom]))
             # extend() grows in place; list + list recopied everything.
             final_names.extend(nlist)
             final_chr.extend([chrom] * len(nlist))
         # Write out final names as readable TSV:
         print("[STATUS] Writing names list order to: " + self.cpnamfile)
         ndf = pd.DataFrame({
             'name': final_names,
             'chr': final_chr,
             'cls': list(np.arange(len(final_names)) + 1)
         })
         ndf.to_csv(self.cpnamfile, sep='\t', index=False)
Exemple #5
0
 def concatenate_chrom(self):
     """Stack the per-chromosome matrices into one matrix and save it.

     For every chromosome in ``self.chrlist`` the per-chromosome files are
     located from ``self.out``, the chromosome name and ``self.re_mid``
     (plus ``"_merged"`` when ``self.mergestates`` is set). A chromosome
     whose ``_csr.cp.gz`` file is missing is first produced with
     ``self.process_chrom``. Each matrix is loaded as CSR and its attribute
     ``'names'`` must equal those of the first chromosome.

     The row-wise stack is stored in ``self.X`` and written as both gzipped
     pickle and sparse CSR under the ``..._allchr`` prefix, together with
     the attributes of the last chromosome. If ``self.chromhmm`` is set and
     ``self.states`` is a list, ``self.write_collapsed_matrix`` is called
     with the same prefix.

     Raises:
         ValueError: If the chromosomes disagree on their names, or if
             ``self.chrlist`` is empty.
     """
     print("[STATUS] Concatenating all chromosomes")
     out_pref = self.out + self.re_mid + "_allchr"
     if self.mergestates:
         out_pref = out_pref + "_merged"
     out_attr = out_pref + "_attr.cp.gz"
     full_names = None
     attr = None
     self.X = None
     # Collect the blocks and stack once at the end: stacking inside the
     # loop recopies the whole accumulated matrix for every chromosome.
     blocks = []
     for chrom in tqdm(self.chrlist):
         chrompref = self.out + "_" + chrom + self.re_mid
         if self.mergestates:
             chrompref = chrompref + "_merged"
         chrdata_file = chrompref + "_csr.cp.gz"
         chrattr_file = chrompref + "_attr.cp.gz"
         if not os.path.isfile(chrdata_file):
             self.process_chrom(chrom)
         X_chr = csr_matrix(AUX.load_file_save_sparse(chrompref))
         attr = AUX.load_pickle_gzip(chrattr_file)
         if full_names is None:
             full_names = attr['names']
         if attr['names'] != full_names:
             raise ValueError("Not the same names!")
         blocks.append(X_chr)
     if not blocks:
         # Without this guard the save below would fail on an unset attr.
         raise ValueError("No chromosomes to concatenate")
     self.X = blocks[0] if len(blocks) == 1 else vstack(blocks)
     del blocks
     print("[STATUS] Writing final matrix")
     # Add indexes dictionary and save attributes:
     AUX.save_pickle_gzip(out_attr, attr)
     # Save as CSR sparse (both NPZ and CP)
     AUX.save_pickle_gzip(out_pref + "_csr.cp.gz", self.X)
     AUX.save_sparse_csr(out_pref + "_csr", self.X)
     if self.chromhmm and isinstance(self.states, list):
         self.write_collapsed_matrix(out_pref)