Beispiel #1
0
 def load_index(self):
     """Populate ``self.idict``, ``self.wdict`` and ``self.ndict``.

     If the processed cache ``self.cplocfile`` exists, the three
     dictionaries are read from it (keys ``'loc'``, ``'weights'`` and
     ``'names'``). If the cache is missing or cannot be read, the raw index
     file ``self.indexlocfile`` is processed line by line, the per-key
     lists are flattened and sorted, the result is written back to
     ``self.cplocfile``, and the ordered names are written as a TSV to
     ``self.cpnamfile``.

     The names TSV is only written when the index is rebuilt, not when it
     is loaded from the cache.
     """
     # Try to load processed locations (faster)
     readfile = self.cplocfile
     if os.path.isfile(readfile):
         print("[STATUS] Loading processed masterlist: " + readfile)
         try:
             out = AUX.load_pickle_gzip(readfile)
             idict, ndict, wdict = out['loc'], out['names'], out['weights']
         except Exception as err:
             # Best effort: an unreadable or incomplete cache triggers a
             # rebuild. ``Exception`` (not a bare ``except``) lets
             # KeyboardInterrupt / SystemExit propagate.
             print("[WARNING] Could not load " + readfile + " (" +
                   str(err) + "); rebuilding")
         else:
             self.idict = idict
             self.ndict = ndict
             self.wdict = wdict
             return

     # Otherwise read the file and process it:
     print("[STATUS] Processing masterlist")
     self.ndict = {}
     self.idict = {}
     self.wdict = {}
     # Reset before parsing; presumably read by process_index_line
     # (defined outside this block) -- TODO confirm.
     self.oldchr = ''
     with open(self.indexlocfile, 'r') as f:
         for line in f:
             self.process_index_line(line)

     # Process the aggregate. Only values of existing keys are replaced,
     # so iterating the dict directly is safe.
     for key in self.idict:
         # Flatten:
         il = np.array(flatten_list(self.idict[key]))
         wl = np.array(flatten_list(self.wdict[key]))
         nl = np.array(flatten_list(self.ndict[key]))
         # Order lists according to bin #:
         order = np.argsort(il)
         self.idict[key] = il[order]
         self.wdict[key] = wl[order]
         self.ndict[key] = nl[order]

     # Save it as pickled file
     out = {
         'names': self.ndict,
         'loc': self.idict,
         'weights': self.wdict
     }
     AUX.save_pickle_gzip(self.cplocfile, out)

     final_names = []
     final_chr = []
     print("[STATUS] Getting order of names:")
     for chrom in self.chrlist:
         # NOTE: Copy exactly the name sort from merge bins:
         nlist = list(np.unique(self.ndict[chrom]))
         # extend() avoids rebuilding the whole list on every chromosome.
         final_names.extend(nlist)
         final_chr.extend([chrom] * len(nlist))

     # Write out final names as readable TSV ('cls' is a 1-based index):
     print("[STATUS] Writing names list order to: " + self.cpnamfile)
     ndf = pd.DataFrame({
         'name': final_names,
         'chr': final_chr,
         'cls': list(np.arange(len(final_names)) + 1)
     })
     ndf.to_csv(self.cpnamfile, sep='\t', index=False)
Beispiel #2
0
 def write_collapsed_matrix(self, pref):
     """Write a state-collapsed CSR copy of ``self.X`` if not already on disk.

     The output path is ``pref + "_collapsed_csr.cp.gz"``. When that path
     already exists nothing is done. Otherwise each column index ``c`` of
     ``self.X`` is mapped to ``int(c / len(self.states))``, the result is
     converted to CSR (scipy sums entries that land on the same cell during
     that conversion) and pickled. The intermediate COO form of ``self.X``
     is kept on ``self.Xcoo``.
     """
     target = pref + "_collapsed_csr.cp.gz"
     n_states = len(self.states)
     if os.path.exists(target):
         # Collapsed matrix was written by an earlier run.
         return

     # To COO, reduced representation:
     self.Xcoo = coo_matrix(self.X)
     merged_cols = (self.Xcoo.col / n_states).astype('int')
     n_rows = self.X.shape[0]
     n_cols = int(self.X.shape[1] / n_states)
     reduced = coo_matrix(
         (self.Xcoo.data, (self.Xcoo.row, merged_cols)),
         (n_rows, n_cols))
     Xcsr = csr_matrix(reduced)
     print("[STATUS] Collapsed matrix to size: " + str(Xcsr.shape))
     AUX.save_pickle_gzip(target, Xcsr)
     print("[STATUS] Done writing out collapsed CSR")
Beispiel #3
0
 def process_chrom(self, chrom):
     """Build and save the per-chromosome matrix for ``chrom``.

     For every index in ``self.pid`` a matrix and its names are fetched
     (via ``get_mat_idlist`` when ``self.intindex`` is truthy, otherwise
     ``get_mat``). The matrices are joined with ``hstack`` in ``self.pid``
     order and the names are concatenated in the same order. The matrix is
     written to ``<prefix>_csr.cp.gz`` and ``{'names': [...]}`` to
     ``<prefix>_attr.cp.gz``, where ``<prefix>`` is
     ``self.out + "_" + chrom + self.re_mid`` plus ``"_merged"`` when
     ``self.mergestates`` is truthy.

     Raises:
         ValueError: if ``self.pid`` is empty, so there is nothing to save.
     """
     print("[STATUS] Processing " + chrom)
     chrompref = self.out + "_" + chrom + self.re_mid
     if self.mergestates:
         chrompref = chrompref + "_merged"
     chrdata_file = chrompref + "_csr.cp.gz"
     chrattr_file = chrompref + "_attr.cp.gz"

     blocks = []
     namelist = []
     for idnum in tqdm(self.pid):
         currid = self.ids[idnum]
         filepref = self.prefixes[idnum] + chrom + self.suffixes[idnum]
         maindir = self.dirs[idnum]
         self.verboseprint(currid + ": " + filepref)
         if self.intindex:
             # Get intersected matrix:
             X, names = self.get_mat_idlist(filepref, chrom, maindir)
         else:
             # Get full matrix for states
             X, names = self.get_mat(filepref, maindir)
         # A single name may come back as a bare value; normalise to list.
         if not isinstance(names, list):
             names = [names]
         blocks.append(X)
         namelist.extend(names)

     if not blocks:
         raise ValueError(
             "No datasets to process for chromosome " + chrom +
             ": self.pid is empty")

     # Stack once instead of re-copying the growing matrix per dataset.
     # A single dataset is kept as-is so its type is left unchanged.
     FULL = blocks[0] if len(blocks) == 1 else hstack(blocks)

     # Print out dataset:
     attr = {'names': namelist}
     print("[STATUS] Writing chromosome " + chrom + " dataset to: " +
           chrdata_file)
     AUX.save_pickle_gzip(chrdata_file, FULL)
     AUX.save_pickle_gzip(chrattr_file, attr)
Beispiel #4
0
 def concatenate_chrom(self):
     """Stack all per-chromosome matrices row-wise into ``self.X`` and save.

     For each chromosome in ``self.chrlist`` the per-chromosome data file
     is created with ``process_chrom`` if it is missing, then loaded and
     converted to CSR. Every chromosome must carry the same ``'names'``
     attribute as the first one. The stacked matrix is stored on
     ``self.X`` and written both as a gzip pickle and via
     ``AUX.save_sparse_csr``. The attributes of the last chromosome (whose
     names equal all the others) are written to ``<prefix>_attr.cp.gz``.
     When ``self.chromhmm`` is truthy and ``self.states`` is a list, the
     collapsed matrix is written as well.

     Raises:
         ValueError: if ``self.chrlist`` is empty, or if two chromosomes
             have different names.
     """
     print("[STATUS] Concatenating all chromosomes")
     out_pref = self.out + self.re_mid + "_allchr"
     if self.mergestates:
         out_pref = out_pref + "_merged"
     out_attr = out_pref + "_attr.cp.gz"

     full_names = None
     attr = None
     chrom_mats = []
     self.X = None
     for chrom in tqdm(self.chrlist):
         chrompref = self.out + "_" + chrom + self.re_mid
         if self.mergestates:
             chrompref = chrompref + "_merged"
         chrdata_file = chrompref + "_csr.cp.gz"
         chrattr_file = chrompref + "_attr.cp.gz"
         if not os.path.isfile(chrdata_file):
             self.process_chrom(chrom)
         X_chr = csr_matrix(AUX.load_file_save_sparse(chrompref))
         attr = AUX.load_pickle_gzip(chrattr_file)
         if full_names is None:
             full_names = attr['names']
         elif attr['names'] != full_names:
             raise ValueError("Not the same names!")
         chrom_mats.append(X_chr)

     if not chrom_mats:
         raise ValueError(
             "No chromosomes to concatenate: self.chrlist is empty")

     # Stack once instead of re-copying the growing matrix per chromosome.
     if len(chrom_mats) == 1:
         self.X = chrom_mats[0]
     else:
         self.X = vstack(chrom_mats)
     # Drop the per-chromosome pieces before the large writes below.
     del chrom_mats

     print("[STATUS] Writing final matrix")
     # Save attributes (names are identical across chromosomes):
     AUX.save_pickle_gzip(out_attr, attr)
     # Save as CSR sparse (both NPZ and CP)
     AUX.save_pickle_gzip(out_pref + "_csr.cp.gz", self.X)
     AUX.save_sparse_csr(out_pref + "_csr", self.X)
     if self.chromhmm and isinstance(self.states, list):
         self.write_collapsed_matrix(out_pref)
Beispiel #5
0
 def save_cpfiles(self, X, names, main, attr):
     """Pickle ``X`` to ``main`` and ``{'names': names}`` to ``attr``.

     Both files are written with ``AUX.save_pickle_gzip``, data file first.
     """
     print("[STATUS] Saving to: " + main)
     payloads = ((main, X), (attr, {'names': names}))
     for path, payload in payloads:
         AUX.save_pickle_gzip(path, payload)