def getMatrix4plot(self, interval, binsize=None):
    def appendSeries2matrix(x, matrix, triangle="both"):
        # x - series with binned contact coordinates to write into the matrix
        if triangle == "both":
            matrix[x["contact_en_bin"], x["contact_st_bin"]] = x["contact_count"]
            matrix[x["contact_st_bin"], x["contact_en_bin"]] = x["contact_count"]
        elif triangle == "upper":
            matrix[x["contact_st_bin"], x["contact_en_bin"]] = x["contact_count"]
        elif triangle == "lower":
            matrix[x["contact_en_bin"], x["contact_st_bin"]] = x["contact_count"]
        else:
            raise ValueError("Unknown triangle option: " + str(triangle))

    if not hasattr(self, "data"):
        logging.error("Please provide the data first")
        return
    if binsize is None:  # infer binsize from data
        binsize = get_bin_size(self.data)
    logging.getLogger(__name__).info("Using binsize " + str(binsize))

    interval_size_bins = (interval.end - interval.start) // binsize + 1
    matrix = np.zeros(shape=(interval_size_bins, interval_size_bins))
    data = self.data.query("@interval.start <= contact_st <= @interval.end & "
                           "@interval.start <= contact_en <= @interval.end")
    data = self.convert2binned(data, interval, binsize)

    with_control = getattr(self, "control", None) is not None
    if with_control:
        logging.getLogger(__name__).debug("Running with control")
        control = self.control.query("@interval.start <= contact_st <= @interval.end & "
                                     "@interval.start <= contact_en <= @interval.end")
        control = self.convert2binned(control, interval, binsize)
        # data fills the upper triangle, control the lower one
        data.apply(appendSeries2matrix, matrix=matrix, triangle="upper", axis="columns")
        control.apply(appendSeries2matrix, matrix=matrix, triangle="lower", axis="columns")
    else:
        data.apply(appendSeries2matrix, matrix=matrix, triangle="both", axis="columns")

    # remember values for future operations
    self.matrix = matrix
    self.binsize = binsize
    self.interval_size_bins = interval_size_bins
    self.interval = interval
    return matrix
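# Usage sketch (assumption, not part of the original class): a minimal helper that
# visualizes the square numpy matrix returned by getMatrix4plot with matplotlib.
# The +1 offset and log colour scale are illustrative choices, not settings taken
# from this code base.
def plot_contact_matrix(matrix, out_png="region.png"):
    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm
    plt.imshow(matrix + 1, cmap="Reds", norm=LogNorm())  # +1 avoids log(0) in empty bins
    plt.colorbar(label="contact count")
    plt.savefig(out_png)
    plt.close()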
def scc(self, validation_data, predicted, out_dir, **kwargs):
    # all validation rows belong to a single chromosome
    chromosome = str(validation_data["chr"].iloc[0])
    binsize = str(get_bin_size(validation_data))
    if "h" not in kwargs:
        kwargs["h"] = 2
    else:
        logging.info("for scc using h = " + str(kwargs["h"]))

    if "loop_file" not in kwargs:
        d = pd.concat([validation_data["contact_st"],
                       validation_data["contact_en"],
                       validation_data["contact_count"],
                       pd.DataFrame(predicted)],
                      axis=1)
    else:
        add_loop(validation_data, kwargs["loop_file"])
        d = pd.concat([validation_data["contact_st"],
                       validation_data["contact_en"],
                       validation_data["contact_count"],
                       pd.DataFrame(predicted),
                       validation_data["IsLoop"]],
                      axis=1)

    in_fname = os.path.join(out_dir, "scc",
                            chromosome + "." + binsize + "." +
                            self.__represent_validation__() + ".scc")
    out_fname = in_fname + ".out"
    d.to_csv(in_fname, sep=" ", index=False)
    logging.info(datetime.datetime.now())

    # The R script computes the stratum-adjusted correlation; optional files
    # (interact_pr_en or p_file/e_file) are passed through as extra arguments.
    if "p_file" not in kwargs or "e_file" not in kwargs:
        if "interact_pr_en" not in kwargs:
            out = subprocess.check_output(["Rscript", kwargs["scc_file"],
                                           in_fname, out_fname,
                                           str(kwargs["h"]), chromosome])
        else:
            out = subprocess.check_output(["Rscript", kwargs["scc_file"],
                                           in_fname, out_fname,
                                           str(kwargs["h"]), chromosome,
                                           kwargs["interact_pr_en"]])
    else:
        out = subprocess.check_output(["Rscript", kwargs["scc_file"],
                                       in_fname, out_fname,
                                       str(kwargs["h"]), chromosome,
                                       kwargs["p_file"], kwargs["e_file"]])
    print(str(out))
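# Rough sketch (assumption): the Rscript invoked above is expected to compute a
# stratum-adjusted correlation (SCC) between real and predicted counts. The helper
# below only approximates that idea for quick sanity checks: it correlates real vs.
# predicted counts within each genomic-distance stratum and averages the per-stratum
# Pearson correlations weighted by the number of pairs. It is NOT the algorithm
# called through kwargs["scc_file"] and ignores the smoothing parameter h; the
# column names real_col/pred_col are illustrative parameters.
def approx_scc(df, binsize, real_col="contact_count", pred_col="predicted"):
    import numpy as np
    strata = (df["contact_en"] - df["contact_st"]) // binsize
    total, weight = 0.0, 0.0
    for _, stratum in df.groupby(strata):
        if len(stratum) < 3:
            continue  # too few pairs to correlate
        r = np.corrcoef(stratum[real_col], stratum[pred_col])[0, 1]
        if np.isnan(r):
            continue  # zero variance within this stratum
        total += r * len(stratum)
        weight += len(stratum)
    return total / weight if weight > 0 else float("nan")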
def MatPlot2HiC(matplot_obj, fname, out_folder):
    def Pandas2ChrSizes(chrsizes_filename, pandas_df):
        # Collect every chromosome in the dataframe, find its maximum coordinate
        # and write "<chr>\t<size>" lines into a chrom.sizes file.
        chromosomes = pandas_df["chr"].unique()
        chrsizes_table = pd.DataFrame(columns=chromosomes)
        for i in range(len(chromosomes)):
            buf = pandas_df.loc[pandas_df['chr'] == chromosomes[i]][['contact_st', 'contact_en']]
            chrsizes_table.at[0, chromosomes[i]] = buf.max().max()
            print('Completed: {}%'.format(i * 100 // len(chromosomes)), end='\r')
        with open(chrsizes_filename, 'w') as chrsizes_file:
            for chrom in list(chrsizes_table):
                chrsizes_file.write(chrom + '\t' + str(chrsizes_table.iloc[0][chrom]) + '\n')

    def Pandas2Pre(pre_filename, pandas_df):
        # Convert the data or control dataframe into a juicer_tools "pre" input file.
        # NOTE: columns are renamed in place, so the caller sees the renamed dataframe.
        pandas_df.columns = ["chr1", "start", "end", "count"]
        pandas_df['str1'] = 0
        # Sanity check: no more than 10% of the non-zero counts should be so small
        # that they vanish after scaling and rounding.
        assert len(pandas_df.loc[(pandas_df['count'] < 0.000001) &
                                 (pandas_df['count'] != 0)]) < (len(pandas_df['count']) / 10)
        pandas_df['exp'] = round(pandas_df['count'] * 1000000).astype(int)
        with open(pre_filename, 'w') as pre_file:
            pandas_df.to_csv(pre_file, sep=" ",
                             columns=['str1', 'chr1', 'start', 'start',
                                      'str1', 'chr1', 'end', 'end', 'exp'],
                             header=False, index=False)

    # make dirs
    try:
        os.makedirs(out_folder + '/' + fname)
        os.makedirs(out_folder + '/' + fname + '/pre')
        os.makedirs(out_folder + '/' + fname + '/hic')
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # make filenames
    chromsizes_filename = out_folder + '/' + fname + '/pre/chrom.sizes'
    pre_data_filename = out_folder + '/' + fname + '/pre/pre_data.txt'
    hic_data_filename = out_folder + '/' + fname + '/hic/data.hic'
    pre_control_filename = out_folder + '/' + fname + '/pre/pre_control.txt'
    hic_control_filename = out_folder + '/' + fname + '/hic/control.hic'

    # make chrom.sizes, pre-HiC for data and control
    print('Make chromosome sizes file...')
    time1 = time.time()
    Pandas2ChrSizes(chromsizes_filename, matplot_obj.data)
    time2 = time.time()
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' Chromosome sizes file created.\n')

    print('Make data pre-HiC file...')
    time1 = time.time()
    Pandas2Pre(pre_data_filename, matplot_obj.data)
    time2 = time.time()
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' DATA pre-HiC file created.\n')

    print('Make control pre-HiC file...')
    time1 = time.time()
    Pandas2Pre(pre_control_filename, matplot_obj.control)
    time2 = time.time()
    # Pandas2Pre renamed the control columns to ["chr1", "start", "end", "count"]
    # in place, so the bin size is inferred from the renamed "start" column.
    binsize = get_bin_size(matplot_obj.control, fields=["start", "start"])
    print('Time: ' + str(round(time2 - time1, 3)) + ' sec\n')
    print(colored("[SUCCESS]", 'green') + ' CONTROL pre-HiC file created.\n')

    # call juicer_tools to convert the pre files into .hic files
    subprocess.call(['java', '-jar', './juicer_tools.jar', 'pre',
                     pre_data_filename, hic_data_filename, chromsizes_filename,
                     '-n', '-r', str(binsize)])
    print(colored("[SUCCESS]", 'green') + ' DATA HiC file created.\n')
    subprocess.call(['java', '-jar', './juicer_tools.jar', 'pre',
                     pre_control_filename, hic_control_filename, chromsizes_filename,
                     '-n', '-r', str(binsize)])
    print(colored("[SUCCESS]", 'green') + ' CONTROL HiC file created.\n')
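# Usage note (assumption): matplot_obj is expected to carry .data and .control
# dataframes with "chr", "contact_st", "contact_en", "contact_count" columns, and
# juicer_tools.jar must be present in the working directory. Each line written by
# Pandas2Pre follows juicer's "short with score" format:
#   <str1> <chr1> <pos1> <frag1> <str2> <chr2> <pos2> <frag2> <score>
# e.g. "0 chr1 1000000 1000000 0 chr1 1500000 1500000 42"
#
#   MatPlot2HiC(matplot_obj, fname="my_experiment", out_folder="output")
#   # -> output/my_experiment/pre/chrom.sizes, pre_data.txt, pre_control.txt
#   # -> output/my_experiment/hic/data.hic, control.hic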
def __init__(self, filename):
    logging.basicConfig(format='%(asctime)s %(name)s: %(message)s',
                        datefmt='%I:%M:%S', level=logging.DEBUG)
    # uint32 must be large enough to hold genomic coordinates (~250 Mb for the largest human chromosome)
    assert np.iinfo(np.dtype("uint32")).max > 250000000
    logging.getLogger(__name__).debug("Reading data")
    data = pd.read_csv(filename, sep=" ",
                       dtype={"contact_st": np.uint32,
                              "contact_en": np.uint32,
                              "contact_count": np.float32,
                              "0": np.float32,
                              "IsLoop": np.uint8})
    data_min, data_max = data["contact_st"].min(), data["contact_en"].max()
    binsize = int(get_bin_size(data))
    logging.getLogger(__name__).debug("Using bin size " + str(binsize))
    assert data_min % binsize == data_max % binsize == 0
    assert data_max > data_min
    chr_len = int((data_max - data_min) // binsize)
    logging.getLogger(__name__).debug("Data size: " + str(chr_len) + " bins")
    matrix = np.zeros(shape=(chr_len + 1, chr_len + 1))

    # Fill matrix: predicted counts ("contact_count") go to the upper triangle,
    # real counts (column "0") to the lower triangle.
    logging.getLogger(__name__).debug("Filling matrix")
    data["contact_st"] = ((data["contact_st"] - data_min) // binsize).astype(int)
    data["contact_en"] = ((data["contact_en"] - data_min) // binsize).astype(int)
    i = data["contact_st"].values
    j = data["contact_en"].values
    matrix[(i, j)] = data["contact_count"].values
    matrix[(j, i)] = data["0"].values

    diag_sums = [np.trace(matrix, i) for i in range(len(matrix))]
    # check that the first two diagonals are empty
    assert diag_sums[0] + diag_sums[1] == 0
    # check that we have distances up to 1.5 Mb
    assert np.trace(matrix, 1500000 // binsize - 1) != 0

    # TODO apply transformations to data here
    logging.getLogger(__name__).debug("Normalizing data")
    mean = -100
    # find mean along first non-zero diagonal
    for ind, val in enumerate(diag_sums):
        if val != 0:
            mean = val / (len(matrix) - ind)
    if mean - 1 <= 1:  # for o/e values the mean is ~1
        logging.getLogger(__name__).info("Assuming contacts, going to convert to o/e values.")
        for i in range(len(matrix)):
            if diag_sums[i] == 0:
                continue
            for j in range(i, len(matrix)):
                matrix[j - i, j] = matrix[j - i, j] / diag_sums[i]
                matrix[j, j - i] = matrix[j, j - i] / diag_sums[i]
    self.split2pairs(matrix, length=1500000 // binsize, step=750000 // binsize)
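# Alternative sketch (assumption): the nested normalization loop above divides every
# element by the total of its |i-j| diagonal (computed from the upper triangle). The
# same transformation can be written with numpy indexing; this standalone helper is
# intended to match the loop's behaviour when applied to the raw matrix, including
# leaving all-zero diagonals untouched.
def normalize_by_diagonal_sums(matrix):
    import numpy as np
    n = len(matrix)
    idx = np.abs(np.subtract.outer(np.arange(n), np.arange(n)))  # |i - j| = diagonal offset
    diag_sums = np.array([np.trace(matrix, k) for k in range(n)])
    divisors = diag_sums[idx]
    divisors[divisors == 0] = 1  # keep empty diagonals as-is
    return matrix / divisors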
    })
    print(data.head())
    assert np.all((data["end"] - data["st"]).values >= 0)

    null_values_count = data.isnull().values.sum()
    if null_values_count != 0:
        print("Found null values")
        print(null_values_count, " out of ", len(data))
        data = data.dropna()
    assert not data.isnull().values.any()

    binsize = get_bin_size(data, fields=["st", "end"])
    data["end"] = data["end"] // binsize
    data["st"] = data["st"] // binsize

    grouped = data.groupby(by="chr")
    boundaries = []
    length = 1500000 // binsize
    step = length // 5
    for chr, grouped_data in grouped:
        start = 0
        print("Computing chr ", chr)
        result = get_compartments(grouped_data, binsize, start, length, step)
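# Sketch (assumption): get_compartments is not shown in this excerpt. A conventional
# way to call A/B compartments from a dense observed/expected matrix is to take the
# first eigenvector (PC1) of its correlation matrix; the sign of PC1 separates the
# two compartments. This helper only illustrates that idea and is not the project's
# get_compartments implementation.
def compartment_pc1(oe_matrix):
    import numpy as np
    corr = np.corrcoef(oe_matrix)            # bin-by-bin correlation of o/e profiles
    corr = np.nan_to_num(corr)               # empty bins produce NaNs; zero them out
    eigvals, eigvecs = np.linalg.eigh(corr)  # symmetric matrix -> eigh, ascending eigenvalues
    return eigvecs[:, -1]                    # eigenvector of the largest eigenvalue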