Esempio n. 1
0
def _chroms_with_all_files(chroms, in_bed_prefixes):
    """Return the chromosomes that have a PerMeth file for every prefix.

    A file counts as present when either ``<prefix><chrom>.bed`` or
    ``<prefix><chrom>.bed.gz`` exists. Every missing file is reported, so all
    prefixes are checked even after the first miss.
    """
    kept = []
    for chrom in chroms:
        keepchrom = True
        for pm_sample in in_bed_prefixes:
            permeth_name = '{}{}.bed'.format(pm_sample, chrom)
            if os.path.exists(permeth_name):
                continue
            permeth_name = '{}{}.bed.gz'.format(pm_sample, chrom)
            if not os.path.exists(permeth_name):
                logging.warning('Cannot access a file %s for %s, skipping!',
                                permeth_name, chrom)
                keepchrom = False
        if keepchrom:
            kept.append(chrom)
    return kept


def _roi_rows(chrom, meth_dict, in_bed_prefixes, min_read_count,
              min_cpg_count):
    """Yield (passing sample count, table line, raw-count line) per ROI.

    ROIs are visited in sorted (start, end) order. Both lines end with a
    newline and start with the chrom/start/end/name columns.
    """
    for start in sorted(meth_dict):
        for end in sorted(meth_dict[start]):
            roi_counts = meth_dict[start][end]
            roi_cols = '{}\t{}\t{}\t{}'.format(
                chrom, start, end, roi_counts['name'])
            perc_cols = []
            raw_cols = []
            file_print_count = 0
            for pm_sample in in_bed_prefixes:
                meth = roi_counts[pm_sample]['meth']
                total = roi_counts[pm_sample]['total']
                cpg = roi_counts[pm_sample]['cpg']
                meth_perc = None
                if total >= min_read_count and cpg >= min_cpg_count:
                    try:
                        meth_reads = float(meth)
                        total_reads = float(total)
                    except ValueError:
                        # Say which sample/ROI is broken before failing.
                        logging.error(
                            'Not a float: meth=%r total=%r (%s %s:%s-%s)',
                            meth, total, pm_sample, chrom, start, end)
                        raise
                    # A zero total can only pass the filter when
                    # min_read_count <= 0; the ratio is undefined, so the
                    # sample is reported as NA instead of dividing by zero.
                    if total_reads:
                        meth_perc = meth_reads / total_reads
                if meth_perc is None:
                    perc_cols.append('NA')
                else:
                    perc_cols.append('{:.3f}'.format(meth_perc))
                    file_print_count += 1
                raw_cols.append('{}\t{}\t{}'.format(meth, total, cpg))
            yield (file_print_count,
                   '\t'.join([roi_cols] + perc_cols) + '\n',
                   '\t'.join([roi_cols] + raw_cols) + '\n')


def roi_meth(in_bed_prefixes, in_sample_list, out_table, mask_file, roi_file,
             min_read_count, min_cpg_count, min_file_count, raw_data_name,
             thread_count):
    """
    Creates a table with the methylation across desired Regions of
    Interest (ROI).

    Chromosomes found in roi_file are processed one at a time. A chromosome
    is skipped (with a warning) unless every prefix has a matching
    ``<prefix><chrom>.bed`` or ``<prefix><chrom>.bed.gz`` file. For each kept
    chromosome, chrom_meth is run once per prefix on worker threads, and the
    'meth', 'total' and 'cpg' values found in the shared dictionary afterwards
    are written out per ROI (presumably stored there by chrom_meth).

    :param in_bed_prefixes: list of bed file prefixes.
    :param in_sample_list: list of sample names. Order corresponds with
                           in_bed_prefixes.
    :param out_table: name of output table file.
    :param mask_file: bed or gtf file that will contain areas masked from
                      analysis (ie: any areas in this file will be ignored).
                      An empty string means no masking.
    :param roi_file: bed or gtf file containing the areas of the genome you
                     want analyzed.
    :param min_read_count: minimum read count necessary for a region of
                           interest. If a sample has less than this read count,
                           NA will be input instead of the average methylation
                           over the ROI.
    :param min_cpg_count: minimum CpG count necessary for a region of
                          interest. If a sample has less than this CpG count,
                          NA will be input instead of the average methylation
                          over the ROI.
    :param min_file_count: minimum file count to keep a region of interest. If
                           less than this many files/samples meet the
                           previous minimum requirements, that roi will not
                           have output in your out_table file.
    :param raw_data_name: optional file that (if not an empty string) will be
                          the output of methylated read, total read and CpG
                          counts for each sample. The minimums still apply
                          and will work the same as the main file.
    :param thread_count: int designating threads to allocate for multithreading.
    :return: Nothing
    :raises ValueError: if a sample passes the minimum filters but its
                        methylated or total count is not numeric.
    """
    # Reduces thread count if there aren't enough tasks to fill all threads
    thread_count = min(thread_count, len(in_bed_prefixes))

    roi_columns = 'chrom\tstart\tend\tname'
    raw_data = None
    # Binary mode is kept from the original implementation.
    outfile = open(out_table, 'wb')
    try:
        outfile.write('{}{}\n'.format(
            roi_columns,
            ''.join('\t{}'.format(samp) for samp in in_sample_list)))
        if raw_data_name != "":
            raw_data = open(raw_data_name, 'wb')
            raw_data.write('{}{}\n'.format(
                roi_columns,
                ''.join('\t{0}_methylated\t{0}_total\t{0}_cpgs'.format(samp)
                        for samp in in_sample_list)))

        roi = BedTool(roi_file)
        if mask_file != "":
            mask = BedTool(mask_file)
        else:
            # Placeholder interval so that a mask object always exists.
            mask = BedTool([('chrNONE', 0, 0)])

        # Get chromosome names in ROI file (first-seen order)
        logging.info('Loading chromosomes:')
        chrom_names_tmp = []
        for line in roi:
            chrom = utilities.show_value(line.chrom)
            if chrom not in chrom_names_tmp:
                chrom_names_tmp.append(chrom)
        # Remove chromosome names without accompanying PerMeth file
        chrom_names = _chroms_with_all_files(chrom_names_tmp, in_bed_prefixes)

        # Loop through, gather information, and print each chrom info
        for chrom in chrom_names:
            # Create methylation dictionary for chromosomal ROI
            roi_chrom = roi.all_hits(BedTool([(chrom, 0, 999999999)])[0])
            meth_dict = utilities.nested_dict(4, str)
            for feature in roi_chrom:
                meth_dict[feature.start][feature.end]['name'] = feature.name
            proc_list = list(in_bed_prefixes)

            def worker():
                """Worker for multithreading that analyzes a chromosome."""
                while True:
                    # Another worker may take the last prefix between an
                    # emptiness check and the pop, so rely on IndexError
                    # rather than testing the list first.
                    try:
                        pm_prefix = proc_list.pop()
                    except IndexError:
                        return
                    chrom_meth(pm_prefix, chrom, roi_chrom, mask, meth_dict)

            # All workers are joined before the next chromosome, so the
            # closure never observes a later iteration's variables.
            threads = [Thread(target=worker) for _ in range(thread_count)]
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join()

            # Print information into table
            rows = _roi_rows(chrom, meth_dict, in_bed_prefixes,
                             min_read_count, min_cpg_count)
            for file_print_count, print_line, raw_col_line in rows:
                if file_print_count >= min_file_count:
                    outfile.write(print_line)
                    if raw_data is not None:
                        raw_data.write(raw_col_line)
    finally:
        outfile.close()
        if raw_data is not None:
            raw_data.close()
Esempio n. 2
0
            pass

    def print_split_sort_bed(self):
        """Stub: does nothing yet and returns None.

        Presumably meant to print the split, sorted BED output (inferred
        from the method name only; no behaviour is implemented here).
        """
        return None


# Upper bound on the number of chunks the windowed BED is divided into.
split_num = 100

# NOTE(review): the original expression ``[[] * 5]`` evaluates to a single
# empty bucket (``[] * 5`` is still ``[]``), so it is written literally here.
# If five buckets were intended, use ``[[] for _ in range(5)]`` instead.
split_region_list = [[]]
print(split_region_list)
bed = BedTool(
    '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')

# Sort and merge the input, then cut it into 100 bp windows.
# NOTE(review): ``b=bed.fn`` points at the original file, not at the
# sorted/merged result; confirm which one window_maker should read.
bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))

# NOTE: no ``all_hits()`` call here; it requires an interval argument, so
# calling it with none raises TypeError, and its result was never used.

# x = BedTool().window_maker(genome='hg38', w=1000000)
bed.saveas(
    '/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

# Count once; never use more chunks than there are regions.
region_count = bed.count()
split_num = min(region_count, split_num)

# Regions per chunk (floor division). Skipped for an empty BED, where
# split_num is 0 and the division would fail.
if split_num:
    print(region_count // split_num)

# print bed.split(10, 'out')

# print x

n = 0
for region in bed:
Esempio n. 3
0
            pass


    def print_split_sort_bed(self):
        """Stub: does nothing yet and returns None.

        Presumably meant to print the split, sorted BED output (inferred
        from the method name only; no behaviour is implemented here).
        """
        return None

# Upper bound on the number of chunks the windowed BED is divided into.
split_num = 100

# NOTE(review): the original expression ``[[]*5]`` evaluates to a single
# empty bucket (``[]*5`` is still ``[]``), so it is written literally here.
# If five buckets were intended, use ``[[] for _ in range(5)]`` instead.
split_region_list = [[]]
print(split_region_list)
bed = BedTool('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test.bed')


# Sort and merge the input, then cut it into 100 bp windows.
# NOTE(review): ``b=bed.fn`` points at the original file, not at the
# sorted/merged result; confirm which one window_maker should read.
bed = BedTool(bed.sort().merge().window_maker(b=bed.fn, w=100))

# NOTE: no ``all_hits()`` call here; it requires an interval argument, so
# calling it with none raises TypeError, and its result was never used.

# x = BedTool().window_maker(genome='hg38', w=1000000)
bed.saveas('/Users/huangzhibo/workitems/10.testData/testPlatformTJ/bed/test_w100.bed')

# Count once; never use more chunks than there are regions.
region_count = bed.count()
split_num = min(region_count, split_num)

# Regions per chunk (floor division). Skipped for an empty BED, where
# split_num is 0 and the division would fail.
if split_num:
    print(region_count // split_num)

# print bed.split(10, 'out')

# print x

n = 0
for region in bed:
    # print region.length