def read_bedgraph(sample_name, chrom_file):
    """Load the bedgraph file for a sample, or None if no file is found.

    Parameters
    ----------
    sample_name
        Sample identifier used to locate the bedgraph file in BEDGRAPH_DIR.
    chrom_file
        Chromosome size file handed to the BedGraph constructor.

    Returns
    -------
    BedGraph or None
        The loaded BedGraph object, or None when no file exists for the sample.
    """
    path = get_file_path(sample_name, BEDGRAPH_DIR)

    # Bail out early when the sample has no bedgraph file on disk
    if path is None:
        print(f"No bedgraph file for {sample_name}")
        return None

    print(f"Reading in {path}")
    return BedGraph(chrom_file, path, ignore_missing_bp=False)
def find_loop_anchor_points(self, bedgraph: BedGraph):
    """Locate exact anchor points for every loop and weigh loops by peak value.

    For each loop anchor interval, the position of the maximum bedgraph
    value becomes the anchor point, and each loop's value is scaled by the
    (normalized) sum of its two anchor peaks.

    Parameters
    ----------
    bedgraph : BedGraph
        Coverage data used to find the anchor points of each loop
    """
    log.info(f'Finding anchor points for {self.sample_name}\'s {self.name}'
             f' from {bedgraph.name}')

    bedgraph.load_chrom_data(self.name)

    # Anchor point = index of the coverage maximum within each anchor interval
    self.start_list = bedgraph.stats(start_list=self.start_anchor_list[0],
                                     end_list=self.start_anchor_list[1],
                                     chrom_name=self.name, stat='max_index')
    self.end_list = bedgraph.stats(start_list=self.end_anchor_list[0],
                                   end_list=self.end_anchor_list[1],
                                   chrom_name=self.name, stat='max_index')

    # Peak height within each anchor interval
    head_peaks = bedgraph.stats(start_list=self.start_anchor_list[0],
                                end_list=self.start_anchor_list[1],
                                chrom_name=self.name, stat='max')
    tail_peaks = bedgraph.stats(start_list=self.end_anchor_list[0],
                                end_list=self.end_anchor_list[1],
                                chrom_name=self.name, stat='max')

    # Keep the raw (un-normalized) peaks around for later use
    self.start_list_peaks = head_peaks
    self.end_list_peaks = tail_peaks

    bedgraph.free_chrom_data(self.name)

    # Normalize so that peaks act as relative weights across the chromosome
    head_weights = head_peaks / head_peaks.sum()
    tail_weights = tail_peaks / tail_peaks.sum()

    # Scale each loop's value by the combined weight of its two anchors
    for idx in range(self.numb_loops):
        self.value_list[idx] *= head_weights[idx] + tail_weights[idx]

    self.max_loop_value = np.max(self.value_list)

    # Should be very small due to peaks being weighted earlier
    log.debug(f"Max loop weighted value: {self.max_loop_value}")
def __init__(self, chrom_size_file: str, loop_file: str, bedgraph: BedGraph,
             peak_dict: Dict[str, list], chroms_to_load: List[str] = None,
             min_loop_value: int = 0):
    """
    Initializes all chromosomes and adds loops to them from given file.

    Finds peak max from bedgraph

    Parameters
    ----------
    chrom_size_file : str
        File containing the base pair size of each chromosome to use
    loop_file : str
        File containing loops in format:
        chrom1  start1   end1 chrom2  start2   end2 pet_count
    bedgraph : BedGraph
        The bedgraph file for this sample (from pyBedGraph)
    peak_dict : dict[str, list]
        Key: Name of chromosome (chr1, chr2, ...)
        Value: List of peaks in chromosome
        Peak format: [start, end, length]
    chroms_to_load : list, optional
        List of names of chromosome to load (default is None)
    min_loop_value : int, optional
        Minimum loop value (PET count) to include (default is 0)
    """
    # Prints peak_dict which is too large to be meaningful
    # log.debug(locals())

    # Sample/species names are derived from the file basenames (text before
    # the first '.')
    self.species_name = os.path.basename(chrom_size_file).split('.')[0]
    self.sample_name = os.path.basename(loop_file).split('.')[0]
    self.total_samples = 0
    self.peak_dict = {}

    # Find values for each peak since peak caller is not accurate sometimes
    # NOTE: the peak lists inside the caller's peak_dict are mutated in place
    # (max and mean values are appended to each peak entry)
    for chrom_name, peak_chrom in peak_dict.items():
        if not bedgraph.has_chrom(chrom_name):
            log.warning(f'{bedgraph.name} does not have {chrom_name}')
            continue

        bedgraph.load_chrom_data(chrom_name)
        start_list = [x[0] for x in peak_chrom]
        end_list = [x[1] for x in peak_chrom]
        max_list = \
            bedgraph.stats(start_list=start_list, end_list=end_list,
                           chrom_name=chrom_name, stat='max')
        mean_list = \
            bedgraph.stats(start_list=start_list, end_list=end_list,
                           chrom_name=chrom_name, stat='mean')

        # Extend every peak entry with its measured max and mean coverage
        for i in range(max_list.size):
            peak_chrom[i].append(max_list[i])
            peak_chrom[i].append(mean_list[i])

        # Free per-chromosome data immediately to limit memory use
        bedgraph.free_chrom_data(chrom_name)
        self.peak_dict[chrom_name] = peak_dict[chrom_name]

    # Initialize all chromosomes to be loaded
    # Only chromosomes that appear in peak_dict, are not in CHROMS_TO_IGNORE,
    # and (if given) are in chroms_to_load get a ChromLoopData object
    self.chrom_dict = {}
    with open(chrom_size_file) as in_file:
        for line in in_file:
            line = line.strip().split()
            chrom_name = line[0]
            if chroms_to_load and chrom_name not in chroms_to_load:
                continue

            if chrom_name in CHROMS_TO_IGNORE:
                continue

            if chrom_name not in peak_dict:
                continue

            chrom_size = int(line[1])

            self.chrom_dict[chrom_name] = \
                ChromLoopData(chrom_name, chrom_size, self.sample_name)

    # Parse the loop file and register each loop with its chromosome
    with open(loop_file) as in_file:
        loop_anchor_list = []
        for line in in_file:
            line = line.strip().split()
            chrom_name = line[0]

            if chrom_name not in self.chrom_dict:
                continue

            # Column 7 is the PET count; loops below min_loop_value are skipped
            loop_value = int(line[6])
            if loop_value < min_loop_value:
                continue

            # head interval
            loop_start1 = int(line[1])
            loop_end1 = int(line[2])

            # tail anchor
            loop_start2 = int(line[4])
            loop_end2 = int(line[5])

            self.chrom_dict[chrom_name].add_loop(loop_start1, loop_end1,
                                                 loop_start2, loop_end2,
                                                 loop_value)

            head_interval = loop_end1 - loop_start1
            tail_interval = loop_end2 - loop_start2

            # Track anchor widths only for the debug statistic below
            loop_anchor_list.append(head_interval)
            loop_anchor_list.append(tail_interval)

    log.debug(f'Anchor mean width: {np.mean(loop_anchor_list)}')

    # Get rid of chroms that had problems initializing
    to_remove = []
    for chrom_name in self.chrom_dict:
        if self.chrom_dict[chrom_name].finish_init(bedgraph):
            self.total_samples += \
                np.sum(self.chrom_dict[chrom_name].value_list)
        else:
            to_remove.append(chrom_name)

    # Chromosomes with no loops or other random problems
    for chrom_name in to_remove:
        del self.chrom_dict[chrom_name]
# NOTE(review): the next three statements reference `key`, `run_time_results`
# and `out`, which are defined in an enclosing function/loop that starts
# before this chunk — they appear to be the tail of a benchmark-result writer.
output = key + "\n" + " ".join([str(x) for x in run_time_results[key]]) + '\n'
out.write(output)
# generate_images.create_runtime_num_test(data_name, num_test_list, run_time_results)

# Script entry: expects a chromosome-sizes file and a data file on argv
if len(sys.argv) != 3:
    print("Needs 2 arguments:\n"
          "arg 1 - chrom_sizes_file\n"
          "arg 2 - bigWig file")
    exit(-1)

chrom_name = 'chr1'

start_time = time.time()
bedGraph = BedGraph(sys.argv[1], sys.argv[2], chrom_name)
print("Time for loading bedGraph file: ", time.time() - start_time)

# NOTE(review): nothing is timed between these two calls — presumably a
# load_chrom_data call was removed or commented out here; verify upstream
start_time = time.time()
print(f"Time for loading {chrom_name}: ", time.time() - start_time, '\n')

bench = Benchmark(bedGraph, sys.argv[2])

# Output directory layout: graphs/<dataset-stem>/
data_name = Path(sys.argv[2]).stem
if not os.path.isdir(f'graphs'):
    os.mkdir(f'graphs')
if not os.path.isdir(f'graphs/{data_name}'):
    os.mkdir(f'graphs/{data_name}')

# runtime_benchmark()
interval_size_error_benchmark()
from pyBedGraph import BedGraph
import os
import csv

# Root directory holding one sub-folder per genome assembly, each containing
# bedgraph files for that assembly.
FOLDER_LOC = '/media/hirow/extra/jax/data/pybedgraph'

# Collect per-sample chr1 summary statistics across every bedgraph file.
stats = []
for folder in os.listdir(FOLDER_LOC):
    for filename in os.listdir(f'{FOLDER_LOC}/{folder}'):
        print(folder, filename)
        # FIX: the bedgraph path must use the current file being iterated
        # (it previously contained a placeholder instead of {filename}).
        # Only chr1 is loaded to keep memory usage down.
        bedgraph = BedGraph(
            f'/media/hirow/extra/jax/data/chrom_sizes/{folder}.chrom.sizes',
            f'{FOLDER_LOC}/{folder}/{filename}', 'chr1')

        sample_name = filename.split('.')[0]
        sample = {}
        sample['name'] = sample_name

        # Pull chr1 summary statistics computed by pyBedGraph at load time
        chrom = bedgraph.chromosome_map['chr1']
        sample['total_coverage'] = chrom.total_coverage
        sample['num_samples'] = chrom.num_samples
        sample['avg_chrom_value'] = chrom.avg_chrom_value
        sample['avg_interval_value'] = chrom.avg_interval_value
        sample['avg_interval_size'] = chrom.avg_interval_size
        sample['num_intervals'] = chrom.num_intervals
        stats.append(sample)

# Write one CSV row per sample; columns come from the first sample's keys.
# NOTE(review): the body of this `with` was truncated in the original chunk;
# reconstructed with csv.DictWriter — confirm against the original file.
csv_columns = list(stats[0].keys())
with open('bedgraph_stats.csv', 'w') as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(stats)
import time
import pyBedGraph
from pyBedGraph import BedGraph

# Show which pyBedGraph installation is actually being exercised
print(f'Using {pyBedGraph.__file__}')

DEBUG = False

# Giving a chrom size file from the wrong assembly must raise RuntimeError
try:
    bedGraph = BedGraph('test_files/hg38.chrom.sizes',
                        'test_files/ENCFF376VCU.bigWig', ['chr14'])
    assert False
except RuntimeError:
    print("Passed giving wrong chrom size test!")

start_time = time.time()
bedGraph = BedGraph('test_files/mm10.chrom.sizes',
                    'test_files/ENCFF376VCU.bedGraph', debug=DEBUG)
print(f"Loading ENCFF376VCU.bedgraph took {time.time() - start_time}")
# Takes 170 seconds on i5-7300HQ

bedGraph.load_chrom_data('chr1')
bedGraph.load_chrom_bins('chr1', 100)
bedGraph.load_chrom_data('chr4')
bedGraph.load_chrom_bins('chr4', 100)

# Expected interval statistics for validation
# NOTE(review): this DEBUG block appears to continue beyond this chunk
if DEBUG:
    total_num_intervals = 0
    avg_interval_sizes = {'chr1': 26.447609, 'chr10': 25.53135}
def read_data(input_data_file: str, chrom_size_file: str,
              min_loop_value: int = 1, min_bedgraph_value: int = 1,
              chroms_to_load: List[str] = None, use_bigwig: bool = False,
              output_dir: str = 'output') -> Dict[str, GenomeLoopData]:
    """
    Reads all samples that are found in loop_data_dir.

    loop_data_dir/peak_data_dir/bedgraph_data_dir do not have to be separate
    directories.

    Parameters
    ----------
    input_data_file : str
        File with file paths to all necessary input files. Format:
        sample1_name   bedgraph1_file   peak1_file   loop2_file
        sample2_name   bedgraph2_file   peak2_file   loop1_file
        ...
    chrom_size_file : str
        Path to chromosome size file
    min_loop_value : int, optional
        Minimum loop value accepted by GenomeLoopData/ChromLoopData
    min_bedgraph_value : int, optional
        Minimum value accepted by BedGraph obj from pyBedGraph
    chroms_to_load : list, optional
        Specify specific chromosomes to load instead of the entire genome
    use_bigwig : bool, optional
        Specify if input_file is bigwig or not. Not implemented yet.
    output_dir : str
        Directory to output data

    Returns
    -------
    OrderedDict[str, GenomeLoopData]
    """
    total_start_time = time.time()

    os.makedirs(f'{output_dir}/timings', exist_ok=True)

    sample_data_dict = OrderedDict()

    # Validate both required input files before doing any work
    if not os.path.isfile(chrom_size_file):
        log.error(f"Chrom size file: {chrom_size_file} is not a valid file")
        return sample_data_dict

    if not os.path.isfile(input_data_file):
        log.error(f"Data file: {input_data_file} is not a valid file")
        return sample_data_dict

    # Get input file names
    input_sample_files = []
    with open(input_data_file) as in_file:
        for line in in_file:
            sample_files = line.split()

            # FIX: skip blank lines (e.g. a trailing newline) instead of
            # aborting the entire read with a column-count error
            if not sample_files:
                continue

            if len(sample_files) != 4:
                log.error(f"Invalid number of columns in {input_data_file}")
                return sample_data_dict

            input_sample_files.append(sample_files)

    sample_timings = OrderedDict()
    for sample_files in input_sample_files:
        sample_start_time = time.time()

        sample_name = sample_files[0]
        bedgraph_file = sample_files[1]
        peak_file = sample_files[2]
        loop_file = sample_files[3]

        # Check for file validity; skip the sample (not the whole run) when
        # any of its three data files is missing
        invalid_file = False
        for i in range(1, 4):
            if not os.path.isfile(sample_files[i]):
                log.error(f"Data file: {sample_files[i]} is not a valid file")
                invalid_file = True
                break
        if invalid_file:
            continue

        log.info(f'Loading {sample_name} ...')

        peak_dict = read_peak_file(peak_file)
        bedgraph = BedGraph(chrom_size_file, bedgraph_file,
                            chroms_to_load=chroms_to_load,
                            ignore_missing_bp=False,
                            min_value=min_bedgraph_value)
        gld = GenomeLoopData(chrom_size_file, loop_file, bedgraph, peak_dict,
                             min_loop_value=min_loop_value,
                             chroms_to_load=chroms_to_load)
        sample_data_dict[sample_name] = gld
        sample_timings[sample_name] = time.time() - sample_start_time

    # Record per-sample and total load times for later inspection
    with open(f'{output_dir}/timings/read_data.txt', 'w') as out_file:
        out_file.write(f'sample_name\ttime_taken\n')
        for sample_name, sample_timing in sample_timings.items():
            out_file.write(f'{sample_name}\t{sample_timing}\n')
        out_file.write(f'total\t{time.time() - total_start_time}\n')

    return sample_data_dict
def mainfn(args):
    """Bin loop intensities from a bedgraph and save aggregated plots.

    Reads an annotated loop file and a bedgraph, computes the max intensity
    in ``nbins`` equal-length bins across each loop span, pickles the binned
    results, and writes aggregated per-class plots to ``p2save_dir``.
    """
    # Unpack CLI arguments
    bedgraph = args.p2bedgraph
    expr_name = args.expr_name
    chrom_size = args.p2chrom
    p2annot_loop = args.p2loop_annot
    p2loop_tag = args.p2loop_tag
    nbins = args.nbins
    p2save_dir = args.p2save_dir
    pseudo = args.pseudo

    # Bedgraph path is <dir>/<experiment name>
    p2bedgraph = os.path.join(bedgraph, expr_name)
    bg = BedGraph(chrom_size, p2bedgraph)

    # Column layout of the annotated loop file (tab separated, no header)
    annot_columns = [
        'left_chr', 'left_start', 'left_end', 'right_chr', 'right_start',
        'right_end', 'PET count', 'left_max_intensity',
        'right_max_intensity', 'left_max_index', 'right_max_index',
        'loop_ID', 'left_motif_chr', 'left_motif_start', 'left_motif_end',
        'left_motif_strand', 'left_distance', 'right_motif_chr',
        'right_motif_start', 'right_motif_end', 'right_motif_strand',
        'right_distance'
    ]
    df_loop = pd.read_csv(p2annot_loop, names=annot_columns, sep='\t')

    chromfile = pd.read_table(chrom_size, names=['chrom', 'size'])

    # Pre-load chromosome coverage for every chromosome that has loops
    for row in chromfile.iterrows():
        chrom_name = row[1]['chrom']
        if chrom_name not in df_loop['left_chr'].values:
            continue
        bg.load_chrom_data(chrom_name)

    if pseudo == 0:
        # Real run: loop tags come from a previously computed file
        loop_tag = pd.read_csv(p2loop_tag, sep='\t', index_col=0)
    elif pseudo == 1:
        # def drop_exceed_loop(df_loop_row, chromfile):
        #     chrom_name = df_loop_row['left_chr']
        #     chrom_len = chromfile[chromfile['chrom'] == chrom_name]['size'].values
        #     loop_end = df_loop_row['right_end']
        #     return loop_end > chrom_len
        # tmp_fn = lambda x: drop_exceed_loop(x, chromfile)
        # df_loop['exceed_chorm'] = df_loop.apply(tmp_fn, axis = 1)
        # df_loop = df_loop[df_loop['exceed_chorm'] == False]
        # df_loop = df_loop.reset_index()

        # Pseudo run: fabricate neutral tags for every loop
        loop_tag = pd.DataFrame(index=df_loop.index,
                                columns=['bias', 'convergence', 'NULL motif'])
        loop_tag.bias = 'balance'
        loop_tag.convergence = 'convergent'
        loop_tag['NULL motif'] = 'na'

    # One row per loop: its bias tag plus the binned intensity vector
    df_binned_intensity_per_loop = pd.DataFrame(
        index=df_loop.index,
        columns=['bias', '{} binned intensity'.format(nbins)])
    df_binned_intensity_per_loop['bias'] = loop_tag['bias']

    # Max intensity per equal-length bin across the full loop span
    tmp_df = df_loop.apply(lambda x: get_max_intensity_in_same_len_bins(
        bg, nbins, x.left_start, x.left_chr, x.right_end, x.right_chr,
        chrom_size=chromfile[chromfile['chrom'] == x.left_chr]['size']),
        axis=1)
    df_binned_intensity_per_loop['{} binned intensity'.format(nbins)] = tmp_df
    df_binned_intensity_per_loop['convergence'] = loop_tag['convergence']
    df_binned_intensity_per_loop['NULL motif'] = loop_tag['NULL motif']
    df_binned_intensity_per_loop['chrom'] = df_loop['left_chr']

    # Persist binned results; pseudo runs get a distinguishing prefix
    binned_intensity_per_loop_name = 'binned_results_{}'.format(expr_name)
    if pseudo == 1:
        binned_intensity_per_loop_name = 'pseudo_' + binned_intensity_per_loop_name
    if not os.path.isdir(p2save_dir):
        os.makedirs(p2save_dir)
    p2binned_intensity_per_loop = os.path.join(p2save_dir,
                                               binned_intensity_per_loop_name)
    df_binned_intensity_per_loop.to_pickle(p2binned_intensity_per_loop)

    # Per-loop normalization: divide every binned-intensity vector by its max
    # NOTE(review): indentation reconstructed — the normalization is assumed
    # to apply only to 'binned intensity' columns; confirm with original file
    norm_df_binned_intensity_per_loop = df_binned_intensity_per_loop.copy()
    binned_name_list = []
    for name in df_binned_intensity_per_loop.columns:
        if 'binned intensity' in name:
            binned_name_list.append(name)
            norm_fn = lambda x: x / max(x)
            norm_df_binned_intensity_per_loop[
                name] = norm_df_binned_intensity_per_loop[name].apply(norm_fn)

    # Aggregate by bias class, on raw (norm == 0) or normalized data
    if args.norm == 0:
        df_agg_sum, df_agg_mean, df_agg_var = get_aggregated_inten_for_each_class(
            df_binned_intensity_per_loop, nbins=nbins, catag='bias')
    else:
        df_agg_sum, df_agg_mean, df_agg_var = get_aggregated_inten_for_each_class(
            norm_df_binned_intensity_per_loop, nbins=nbins, catag='bias')

    # One aggregated plot per bias label
    for label in df_agg_mean.columns:
        fig_name = 'norm_sum_agg_plot_{}_{}'.format(label.replace(' ', '_'),
                                                    expr_name)
        if pseudo == 1:
            fig_name = 'pseudo_' + fig_name
        p2avg_fig = os.path.join(p2save_dir, 'aggregated_plots', fig_name)
        # aggre_by_mean_var(df_agg_mean, df_agg_var, label=label,
        #                   chrom='whole genome', scilent=True,
        #                   p2f=p2avg_fig)
        aggre_by_sum(df_agg_sum, label=label, chrom='whole genome',
                     scilent=True, p2f=p2avg_fig)
import sys
sys.path.append("../..")

from pyBedGraph import BedGraph

# arg1 - chromosome sizes file
# arg2 - bedgraph file
# arg3 - (optional) chromosome_name

# Just load chromosome 'chr1' (uses less memory and takes less time)
bedGraph = BedGraph('myChrom.sizes', 'random_test.bedGraph', 'chr1')

# Load the whole bedGraph file
# FIX: omit the chromosome argument so the whole file is loaded, matching
# the comment (previously this duplicated the chr1-only call above)
bedGraph = BedGraph('myChrom.sizes', 'random_test.bedGraph')

# Option to not ignore missing basePairs when calculating statistics
# Used the exact same way but produces slightly different results
inclusive_bedGraph = BedGraph('myChrom.sizes', 'random_test.bedGraph',
                              ignore_missing_bp=False)

bedGraph.load_chrom_data('chr1')
inclusive_bedGraph.load_chrom_data('chr1')
bedGraph.load_chrom_bins('chr1', 3)
inclusive_bedGraph.load_chrom_bins('chr1', 3)

import numpy as np

# Option 1: pass intervals as [chrom, start, end] triples
test_intervals = [['chr1', 24, 26], ['chr1', 12, 15], ['chr1', 8, 12],
                  ['chr1', 9, 10], ['chr1', 0, 5]]
values = bedGraph.stats(intervals=test_intervals)
def main_fn(args):
    """Tag loops (bias/convergence/NULL motif), aggregate counts per
    chromosome, and compute binned bedgraph intensities per loop.

    Outputs:
      - loop tag table (``p2save_loop_tag``) and a labeled copy of the loop
        annotation file (``<p2loop_file>_added_labels``)
      - three aggregated count CSVs (``<p2agg_stats>_*.csv``)
      - a pickled DataFrame of binned intensities per loop
        (``p2binned_intensity_per_loop``)
    """
    # Unpack CLI arguments
    p2loop_file = args.p2loop_file
    p2bedgraph = args.p2bedgraph
    p2save_loop_tag = args.p2save_loop_tag
    nbins = args.nbins
    p2chrom_size = args.p2chrom_size
    p2binned_intensity_per_loop = args.p2binned_intensity_per_loop
    p2agg_stats = args.p2agg_stats

    # Column layout of the annotated loop file (tab separated, no header)
    annot_col_names = [
        'left_chr', 'left_start', 'left_end', 'right_chr', 'right_start',
        'right_end', 'PET count', 'left_max_intensity',
        'right_max_intensity', 'left_max_index', 'right_max_index',
        'loop_ID', 'left_motif_chr', 'left_motif_start', 'left_motif_end',
        'left_motif_strand', 'left_distance', 'right_motif_chr',
        'right_motif_start', 'right_motif_end', 'right_motif_strand',
        'right_distance'
    ]

    # Strand-pair -> convergence class
    conv_dict = {
        '+-': 'convergence',
        '-+': 'divergence',
        '++': 'right tandem',
        '--': 'left tandem'
    }
    # Strand-pair -> NULL-motif class ('.' means no motif on that side)
    null_dict = {
        '.+': 'NULL-right',
        '.-': 'NULL-left',
        '-.': 'left-NULL',
        '+.': 'right-NULL',
        '..': 'NULL'
    }

    df_loop = pd.read_table(p2loop_file, names=annot_col_names)

    # Tag every loop with bias, convergence and NULL-motif labels
    loop_tag = pd.DataFrame(columns=['bias', 'convergence', 'NULL motif'],
                            index=df_loop.index)
    loop_tag['bias'] = df_loop.apply(lambda x: binomial_test_fn(
        x.left_max_intensity, x.right_max_intensity), axis=1)
    loop_tag['convergence'] = df_loop.apply(lambda x: motif_convergence_fn(
        x.left_motif_strand, x.right_motif_strand, conv_dict), axis=1)
    loop_tag['NULL motif'] = df_loop.apply(lambda x: find_NULL_motif(
        x.left_motif_strand, x.right_motif_strand, null_dict), axis=1)

    # save loop tag and added label loop annotation file.
    df_loop_new = df_loop.copy()
    df_loop_new[['bias', 'convergence', 'NULL motif'
                 ]] = loop_tag[['bias', 'convergence', 'NULL motif']]
    loop_tag.to_csv(p2save_loop_tag, sep='\t')
    p2labeled_loop = p2loop_file + '_added_labels'
    df_loop_new.to_csv(p2labeled_loop, sep='\t')

    # aggregate bias
    # Chromosomes sorted numerically, chrX last, then a whole-genome row
    # NOTE(review): the sort key assumes names are chr1..chrN or chrX — a
    # 'chrY' or unplaced contig would raise ValueError; confirm inputs
    chrom_list = list(
        set(df_loop['left_chr']).union(set(df_loop['right_chr'])))
    chrom_list.sort(key=lambda x: int(x[3:]) if x != 'chrX' else 24)
    chrom_list.append('whole genome')

    df_bias_count = pd.DataFrame(columns=[
        'balance_loop_count',
        'balance_PET_count',
        'left_biased_loop_count',
        'left_biased_PET_count',
        'right_biased_loop_count',
        'right_biased_PET_count',
    ], index=chrom_list)

    for chrom in chrom_list[:-1]:
        # Loop and PET counts per bias class, restricted to this chromosome
        chrom_loop_idx = (df_loop['left_chr'] == chrom)
        balance_tag_idx = (loop_tag['bias'] == 'balance')
        left_bias_tag_idx = (loop_tag['bias'] == 'left biased')
        right_bias_tag_idx = (loop_tag['bias'] == 'right biased')

        chrom_balance_idx = (balance_tag_idx & chrom_loop_idx)
        chrom_left_biased_idx = (left_bias_tag_idx & chrom_loop_idx)
        chrom_right_biased_idx = (right_bias_tag_idx & chrom_loop_idx)

        chrom_balance_count = chrom_balance_idx.sum()
        chrom_left_biased_count = chrom_left_biased_idx.sum()
        chrom_right_biased_count = chrom_right_biased_idx.sum()

        chrom_balance_PET = df_loop.loc[chrom_balance_idx]['PET count'].sum()
        chrom_left_biased_PET = df_loop.loc[chrom_left_biased_idx][
            'PET count'].sum()
        chrom_right_biased_PET = df_loop.loc[chrom_right_biased_idx][
            'PET count'].sum()

        df_bias_count.loc[chrom] = {
            'balance_loop_count': chrom_balance_count,
            'balance_PET_count': chrom_balance_PET,
            'left_biased_loop_count': chrom_left_biased_count,
            'left_biased_PET_count': chrom_left_biased_PET,
            'right_biased_loop_count': chrom_right_biased_count,
            'right_biased_PET_count': chrom_right_biased_PET
        }

    df_bias_count.loc['whole genome'] = df_bias_count.loc[chrom_list[:-1]].sum(
        axis=0)
    df_bias_count['loop_count_proportion_blr'] = df_bias_count.apply(
        lambda x: count_proportion_fn(x, 'balance_loop_count',
                                      'left_biased_loop_count',
                                      'right_biased_loop_count'),
        axis=1)
    df_bias_count['PET_count_proportion_blr'] = df_bias_count.apply(
        lambda x: count_proportion_fn(x, 'balance_PET_count',
                                      'left_biased_PET_count',
                                      'right_biased_PET_count'),
        axis=1)
    p2df_bias_count = p2agg_stats + '_bias_count.csv'
    df_bias_count.to_csv(p2df_bias_count)

    # aggregate convergence results.
    conv_column_list = [
        'convergence_loop_count', 'convergence_PET_count',
        'divergence_loop_count', 'divergence_PET_count',
        'left_tandem_loop_count', 'left_tandem_PET_count',
        'right_tandem_loop_count', 'right_tandem_PET_count'
    ]
    df_convergence_count = pd.DataFrame(columns=conv_column_list,
                                        index=chrom_list)
    for chrom in chrom_list[:-1]:
        chrom_loop_idx = (df_loop['left_chr'] == chrom)
        convergence_tag_idx = (loop_tag['convergence'] == 'convergence')
        divergence_tag_idx = (loop_tag['convergence'] == 'divergence')
        left_tendem_tag_idx = (loop_tag['convergence'] == 'left tandem')
        right_tendem_tag_idx = (loop_tag['convergence'] == 'right tandem')

        chrom_convergence_idx = (convergence_tag_idx & chrom_loop_idx)
        chrom_divergence_idx = (divergence_tag_idx & chrom_loop_idx)
        chrom_left_tendem_idx = (left_tendem_tag_idx & chrom_loop_idx)
        chrom_right_tendem_idx = (right_tendem_tag_idx & chrom_loop_idx)

        chrom_convergence_count = chrom_convergence_idx.sum()
        chrom_divergence_count = chrom_divergence_idx.sum()
        chrom_left_tendem_count = chrom_left_tendem_idx.sum()
        chrom_right_tendem_count = chrom_right_tendem_idx.sum()

        chrom_convergence_PET = df_loop.loc[chrom_convergence_idx][
            'PET count'].sum()
        chrom_divergence_PET = df_loop.loc[chrom_divergence_idx][
            'PET count'].sum()
        chrom_left_tendem_PET = df_loop.loc[chrom_left_tendem_idx][
            'PET count'].sum()
        chrom_right_tendem_PET = df_loop.loc[chrom_right_tendem_idx][
            'PET count'].sum()

        count_list = [
            chrom_convergence_count, chrom_convergence_PET,
            chrom_divergence_count, chrom_divergence_PET,
            chrom_left_tendem_count, chrom_left_tendem_PET,
            chrom_right_tendem_count, chrom_right_tendem_PET
        ]
        df_convergence_count.loc[chrom] = dict(
            zip(conv_column_list, count_list))

    df_convergence_count.loc['whole genome'] = df_convergence_count.loc[
        chrom_list[:-1]].sum(axis=0)
    df_convergence_count[
        'PET_count_proportion_cdlr'] = df_convergence_count.apply(
            lambda x: convergence_proportion_fn(
                x, 'convergence_PET_count', 'divergence_PET_count',
                'left_tandem_PET_count', 'right_tandem_PET_count'),
            axis=1)
    p2df_convergence_count = p2agg_stats + '_convergence_count.csv'
    df_convergence_count.to_csv(p2df_convergence_count)

    # aggregate NULL motif.
    NULL_name_list = list(set(loop_tag['NULL motif']))
    NULL_name_list.sort()
    NULL_column_list = []
    for n in NULL_name_list:
        if n == 'na':
            continue
        NULL_column_list.append('{}_loop_count'.format(n))
        NULL_column_list.append('{}_PET_count'.format(n))

    df_NULL_count = pd.DataFrame(columns=NULL_column_list, index=chrom_list)
    for chrom in chrom_list[:-1]:
        chrom_loop_idx = (df_loop['left_chr'] == chrom)
        NULL_val_list = []
        for n in NULL_column_list:
            # Column name format is '<type>_loop_count' / '<type>_PET_count'
            cur_type = n.split('_')[0]
            cur_tag_idx = (loop_tag['NULL motif'] == cur_type)
            chrom_cur_tag_idx = (cur_tag_idx & chrom_loop_idx)
            if n.split('_')[1] == 'loop':
                chrom_cur_count = chrom_cur_tag_idx.sum()
            elif n.split('_')[1] == 'PET':
                chrom_cur_count = df_loop.loc[chrom_cur_tag_idx][
                    'PET count'].sum()
            NULL_val_list.append(chrom_cur_count)
        df_NULL_count.loc[chrom] = dict(zip(NULL_column_list, NULL_val_list))

    df_NULL_count.loc['whole genome'] = df_NULL_count.loc[
        chrom_list[:-1]].sum()
    loop_count_name_list = [x for x in NULL_column_list if 'loop' in x]
    df_NULL_count['loop_nn_nl_nr_ln_rn'] = df_NULL_count.apply(
        lambda x: NULL_proportion_fn(x, loop_count_name_list), axis=1)
    PET_count_name_list = [x for x in NULL_column_list if 'PET' in x]
    df_NULL_count['PET_nn_nl_nr_ln_rn'] = df_NULL_count.apply(
        lambda x: NULL_proportion_fn(x, PET_count_name_list), axis=1)
    p2df_NULL_count = p2agg_stats + '_NULL_motif_count.csv'
    df_NULL_count.to_csv(p2df_NULL_count)

    # READ bedgraph file and get intensity
    # FIX: removed leftover debugger breakpoint (ipdb.set_trace()) that
    # halted every run here
    bg = BedGraph(p2chrom_size, p2bedgraph)
    chromfile = pd.read_table(p2chrom_size, names=['chrom', 'size'])
    for row in chromfile.iterrows():
        bg.load_chrom_data(row[1]['chrom'])

    bin_name = '{} binned intensity'.format(nbins)
    df_binned_intensity_per_loop = pd.DataFrame(index=df_loop.index,
                                                columns=['bias', bin_name])
    df_binned_intensity_per_loop['bias'] = loop_tag['bias']

    # Max intensity per equal-length bin across each loop span
    my_bg = bg
    tmp_df = df_loop.apply(lambda x: get_max_intensity_in_same_len_bins(
        my_bg, nbins, x.left_start, x.left_chr, x.right_end, x.right_chr),
        axis=1)
    df_binned_intensity_per_loop[bin_name] = tmp_df
    df_binned_intensity_per_loop['convergence'] = loop_tag['convergence']
    df_binned_intensity_per_loop['NULL motif'] = loop_tag['NULL motif']
    df_binned_intensity_per_loop['chrom'] = df_loop['left_chr']
    df_binned_intensity_per_loop.to_pickle(p2binned_intensity_per_loop)