def _roi_header(sample_names, column_template):
    """Build a tab-separated header: the fixed ROI columns plus, for each
    sample, ``column_template`` formatted with the sample name as ``{0}``."""
    columns = ['chrom', 'start', 'end', 'name']
    columns.extend(column_template.format(samp) for samp in sample_names)
    return '{}\n'.format('\t'.join(columns))


def _as_number(value):
    """Return ``value`` as a float, or None (after reporting it) when it is
    not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        print("Not a float: {}".format(value))
        return None


def roi_meth(in_bed_prefixes, in_sample_list, out_table, mask_file, roi_file,
             min_read_count, min_cpg_count, min_file_count, raw_data_name,
             thread_count):
    """
    Creates a table with the methylation across desired Regions of Interest
    (ROI).

    For every chromosome named in the ROI file that has a ``.bed`` or
    ``.bed.gz`` file for every prefix, ``chrom_meth`` is run once per prefix
    (spread over worker threads) and one row per ROI is written.

    :param in_bed_prefixes: list of bed file prefixes. The per-chromosome file
        is looked up as ``<prefix><chrom>.bed`` and then
        ``<prefix><chrom>.bed.gz``.
    :param in_sample_list: list of sample names. Order corresponds with
        in_bed_prefixes.
    :param out_table: name of output table file.
    :param mask_file: bed or gtf file that will contain areas masked from
        analysis (ie: any areas in this file will be ignored). Pass "" for no
        mask.
    :param roi_file: bed or gtf file containing the areas of the genome you
        want analyzed.
    :param min_read_count: minimum read count necessary for a region of
        interest. If a sample has less than this read count, NA will be
        input instead of the average methylation over the ROI.
    :param min_cpg_count: minimum CpG count necessary for a region of
        interest. If a sample has fewer CpGs than this, NA will be input
        instead of the average methylation over the ROI.
    :param min_file_count: minimum file count to keep a region of interest.
        If less than this many files/samples meet the previous minimum
        requirements, that roi will not have output in your out_table file
        (nor in the raw data file).
    :param raw_data_name: optional file that (if not "") will be the output
        of methylated read count, total read count and CpG count for each
        sample. Rows are kept or dropped together with the main table.
    :param thread_count: int designating threads to allocate for
        multithreading.
    :return: Nothing
    """
    # Reduces thread count if there aren't enough tasks to fill all threads
    thread_count = min(thread_count, len(in_bed_prefixes))

    outfile = open(out_table, 'wb')
    raw_data = None
    # try/finally so both tables are flushed and closed even if a later
    # step raises.
    try:
        outfile.write(_roi_header(in_sample_list, '{0}'))
        if raw_data_name != "":
            raw_data = open(raw_data_name, 'wb')
            raw_data.write(_roi_header(
                in_sample_list, '{0}_methylated\t{0}_total\t{0}_cpgs'))

        roi = BedTool(roi_file)
        if mask_file != "":
            mask = BedTool(mask_file)
        else:
            # Placeholder mask used when no mask file is supplied.
            mask = BedTool([('chrNONE', 0, 0)])

        # Get chromosome names in ROI file (first-seen order, no duplicates)
        logging.info('Loading chromosomes:')
        chrom_names_tmp = []
        for line in roi:
            chrom = utilities.show_value(line.chrom)
            if chrom not in chrom_names_tmp:
                chrom_names_tmp.append(chrom)

        # Remove chromosome names without accompanying PerMeth file
        chrom_names = []
        for chrom in chrom_names_tmp:
            keepchrom = True
            # Every prefix is checked (no early exit) so each missing file
            # gets its own message.
            for pm_sample in in_bed_prefixes:
                permeth_name = '{}{}.bed'.format(pm_sample, chrom)
                if not os.path.exists(permeth_name):
                    permeth_name = '{}{}.bed.gz'.format(pm_sample, chrom)
                    if not os.path.exists(permeth_name):
                        print('Cannot access a file {} for {}, skipping!'
                              .format(permeth_name, chrom))
                        keepchrom = False
            if keepchrom:
                chrom_names.append(chrom)

        # Loop through, gather information, and print each chrom info
        for chrom in chrom_names:
            # Create methylation dictionary for chromosomal ROI
            roi_chrom = roi.all_hits(BedTool([(chrom, 0, 999999999)])[0])
            # NOTE(review): presumably a 4-level defaultdict whose missing
            # leaves are '' (str()) -- confirm in utilities.nested_dict.
            meth_dict = utilities.nested_dict(4, str)
            for feature in roi_chrom:
                meth_dict[feature.start][feature.end]['name'] = feature.name
            proc_list = list(in_bed_prefixes)

            def worker():
                """Worker for multithreading that analyzes a chromosome."""
                while True:
                    # Pop first and treat an empty list as "done": testing
                    # the list and then popping lets another thread take
                    # the last item in between, raising IndexError.
                    try:
                        pm_prefix = proc_list.pop()
                    except IndexError:
                        return
                    chrom_meth(pm_prefix, chrom, roi_chrom, mask, meth_dict)

            # All threads are joined before the next chromosome, so the
            # closure over chrom/roi_chrom/meth_dict never sees a later
            # iteration's values.
            threads = [Thread(target=worker) for _ in range(thread_count)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            # Print information into table
            for start in sorted(meth_dict):
                for end in sorted(meth_dict[start]):
                    name = meth_dict[start][end]['name']
                    row_prefix = '{}\t{}\t{}\t{}'.format(
                        chrom, start, end, name)
                    table_cells = [row_prefix]
                    raw_cells = [row_prefix]
                    file_print_count = 0
                    for pm_sample in in_bed_prefixes:
                        counts = meth_dict[start][end][pm_sample]
                        meth = counts['meth']
                        total = counts['total']
                        cpg = counts['cpg']
                        meth_num = _as_number(meth)
                        total_num = _as_number(total)
                        cpg_num = _as_number(cpg)
                        # Compare as numbers: a non-numeric placeholder must
                        # not pass the thresholds, and a zero total cannot
                        # be averaged. Both cases become NA.
                        usable = (
                            meth_num is not None
                            and total_num is not None
                            and cpg_num is not None
                            and total_num > 0
                            and total_num >= min_read_count
                            and cpg_num >= min_cpg_count
                        )
                        if usable:
                            table_cells.append(
                                '{:.3f}'.format(meth_num / total_num))
                            file_print_count += 1
                        else:
                            table_cells.append('NA')
                        # Raw table always carries the stored values as-is.
                        raw_cells.append(
                            '{}\t{}\t{}'.format(meth, total, cpg))
                    if file_print_count >= min_file_count:
                        outfile.write('{}\n'.format('\t'.join(table_cells)))
                        if raw_data is not None:
                            raw_data.write(
                                '{}\n'.format('\t'.join(raw_cells)))
    finally:
        outfile.close()
        if raw_data is not None:
            raw_data.close()
pass def print_split_sort_bed(self): pass split_num = 100 split_region_list = [[] * 5] print split_region_list bed = BedTool( '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed') bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100)) bed.all_hits() # x = BedTool().window_maker(genome='hg38', w=1000000) bed.saveas( '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed') split_num = bed.count() if bed.count() < split_num else split_num print bed.count() / split_num # print bed.split(10, 'out') # print x n = 0 for region in bed:
pass def print_split_sort_bed(self): pass split_num = 100 split_region_list = [[]*5] print split_region_list bed = BedTool('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed') bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100)) bed.all_hits() # x = BedTool().window_maker(genome='hg38', w=1000000) bed.saveas('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed') split_num = bed.count() if bed.count() < split_num else split_num print bed.count()/split_num # print bed.split(10, 'out') # print x n = 0 for region in bed: # print region.length