def main():
    """Calculate the base-pair overlap of two BED12+ files.

    Prints the number of bases unique to each input file and the number
    of bases covered by both, using bedtools subtract/intersect with
    split (per-block) semantics.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script calculates the overlap, in base pairs, of two bed files. "
        "It also calculates the number of base pairs which are unique to each bed file.")

    parser.add_argument('bed_1', help="The first BED12+ file")
    parser.add_argument('bed_2', help="The second BED12+ file")
    parser.add_argument('--num-cpus', help="The number of CPUs to use when counting the "
        "BED12 feature lengths", type=int, default=default_num_cpus)
    args = parser.parse_args()

    # bedtools must be installed for the subtract/intersect calls below
    programs = ['subtractBed']
    utils.check_programs_exist(programs)

    # read in the files and convert for use in bedtools
    bed_1_df = bed_utils.read_bed(args.bed_1)
    bed_2_df = bed_utils.read_bed(args.bed_2)

    bed_1 = pybedtools.BedTool.from_dataframe(
        bed_1_df[bed_utils.bed12_field_names])
    bed_2 = pybedtools.BedTool.from_dataframe(
        bed_2_df[bed_utils.bed12_field_names])

    # first, the bases unique to bed_1
    bed_1_only = bed_1.subtract(bed_2, split=True)
    bed_1_only_df = bed_1_only.to_dataframe(names=bed_utils.bed12_field_names)

    bed_1_only_sizes = parallel.apply_parallel(
        bed_1_only_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_1_only_coverage = np.sum(bed_1_only_sizes)

    # now the bed_2 unique bases
    bed_2_only = bed_2.subtract(bed_1, split=True)
    # fix: the original referenced bio.bed12_field_names here; every other
    # call in this function uses bed_utils.bed12_field_names, and "bio" is
    # not used anywhere else in this block
    bed_2_only_df = bed_2_only.to_dataframe(names=bed_utils.bed12_field_names)

    bed_2_only_sizes = parallel.apply_parallel(
        bed_2_only_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_2_only_coverage = np.sum(bed_2_only_sizes)

    # and the overlap
    bed_1_and_2 = bed_1.intersect(bed_2, split=True)
    bed_1_and_2_df = bed_1_and_2.to_dataframe(
        names=bed_utils.bed12_field_names)

    bed_1_and_2_sizes = parallel.apply_parallel(
        bed_1_and_2_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_1_and_2_coverage = np.sum(bed_1_and_2_sizes)

    print("{} unique bases: {}".format(args.bed_1, bed_1_only_coverage))
    print("{} unique bases: {}".format(args.bed_2, bed_2_only_coverage))
    print("Overlapping bases: {}".format(bed_1_and_2_coverage))
def main():
    """Replace all score and color values with 0 in every bed file found
    in the current directory (by extension), optionally prompting before
    modifying each file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script replaces all score and color values in bed files with "
        "'0'. It applies this to all bed files in the current directory.")

    parser.add_argument('--no-ask', help="By default, the program will ask to replace "
        "the values for each bed file. If this flag is given, then the asking will be "
        "skipped.", action='store_true')
    parser.add_argument('--bed-extensions', help="The extensions to treat as "
        "bed files", nargs='+', default=default_bed_extensions)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    ask = not args.no_ask

    for bed_extension in args.bed_extensions:
        # fix: renamed this local from "re" so it no longer shadows the
        # standard-library regex module; it is a glob pattern, not a regex
        file_pattern = "*{}".format(bed_extension)
        bed_files = glob.glob(file_pattern)

        for bed_file in bed_files:
            print("fix: {}".format(bed_file))

            # fix_bed presumably prompts the user about this file — TODO
            # confirm; --no-ask bypasses the prompt entirely
            if (not ask) or fix_bed(bed_file):
                bed = bed_utils.read_bed(bed_file)
                bed['score'] = 0
                bed['color'] = 0
                bed_utils.write_bed(bed, bed_file)
def main():
    """Split each BED12+ feature into its blocks, producing a BED6+2 file
    with one exon per row (extra columns: exon_index, transcript_start).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits BED12+ files into a BED6+ file. Each block "
        "(i.e., exon) in the original file is an individual feature in the new file. "
        "There are two extra fields, exon_index and transcript_start, which give the "
        "index of the exon within its transcript and the start of the exon in the "
        "\"spliced\" version of the transcript. The \"id\" column in the original file "
        "is used as the \"id\" in the new file, so the exons can easily be grouped.")

    parser.add_argument('bed', help="The BED12+ file")
    parser.add_argument('out', help="The output BED6+2 file")
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    parser.add_argument('--num-groups', help="The number of groups to split the "
        "bed file into for parallelization", type=int, default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading BED12+ file")
    bed12_df = bed_utils.read_bed(args.bed)

    logger.info("Splitting blocks")
    # each group of records is expanded to its exons in parallel
    exon_groups = parallel.apply_parallel_split(
        bed12_df,
        args.num_cpus,
        split_all_blocks,
        progress_bar=True,
        num_groups=args.num_groups
    )

    logger.info("Merging exons into a data frame")
    exons_df = pd.concat(exon_groups)

    # keep only the BED6 columns plus the two bookkeeping fields, in order
    column_order = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
    exons_df = exons_df[column_order]

    logger.info("Writing BED6+2 file")
    bed_utils.write_bed(exons_df, args.out)
def main():
    """Merge several bed12+ files, drop duplicate entries, sort the
    survivors, and write the result to a single output file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove duplicate entries from a list of bed12+ files. "
        "Write the non-redundant entries to a new file. Precedence among "
        "duplicates is arbitrary.")

    parser.add_argument('bed', help="The input bed file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output bed(.gz) file",
        required=True)
    parser.add_argument('--compress', help="If this flag is given, the output "
        "will be gzipped. The output filename *will not* be changed (so it "
        "should already end in \".gz\").", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading bed files")
    bed_frames = [bed_utils.read_bed(bed_path) for bed_path in args.bed]

    for bed_path, frame in zip(args.bed, bed_frames):
        logger.debug("{}. number of entities: {}".format(bed_path, len(frame)))

    logger.info("Concatenating bed entries")
    combined = pd.concat(bed_frames)

    logger.info("Removing duplicate entries")
    # which duplicate survives is arbitrary (drop_duplicates keeps the first)
    combined = combined.drop_duplicates(subset=DUPLICATE_FIELDS)
    logger.debug("number of non-redundant entries: {}".format(len(combined)))

    logger.info("Sorting non-redundant entries")
    combined = combined.sort_values(by=['seqname', 'start', 'end', 'strand'])

    logger.info("Writing sorted, non-redundant entries to disk")
    bed_utils.write_bed(combined, args.out, compress=args.compress)
def main():
    """Remove from bed file A every entry which overlaps an entry from
    bed file B, optionally requiring minimum overlap fractions, and write
    the surviving A entries to disk.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the entries from A which overlap "
        "any of the entries from B. Optionally, some minimum overlap can be "
        "given, in terms of overlap fraction.")

    parser.add_argument('bed_a', help="The bed file from which entries will be "
        "removed")
    parser.add_argument('bed_b', help="The bed file used to find entries in A "
        "to remove")
    parser.add_argument('out', help="The output (bed.gz) file")

    parser.add_argument('--min-a-overlap', help="A minimum fraction required "
        "for overlap of the A entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_a_overlap)
    parser.add_argument('--min-b-overlap', help="A minimum fraction required "
        "for overlap of the B entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_b_overlap)

    parser.add_argument('--split', help="If this flag is given, then the bed "
        "entries in both files will be split. This can be somewhat slow, "
        "depending on the number of entries in the files.", action='store_true')

    # fix: the --split branch below reads args.num_groups, but the original
    # script never added this argument, so giving --split always raised
    # AttributeError. Default of 1 means "no extra grouping" unless requested.
    parser.add_argument('--num-groups', help="The number of groups to split "
        "the bed files into for parallelization when --split is given",
        type=int, default=1)

    parser.add_argument('--exons', help="If the bed entries have already been "
        "split and the exon bed6+2 file (from split-bed12-blocks program) is "
        "available, then that can be given with this option. The exons from "
        "that file will be used for both A and B.", default=None)
    parser.add_argument('--exons-a', help="As with the --exons argument, but "
        "these exons will only be used for A", default=None)
    parser.add_argument('--exons-b', help="As with the --exons argument, but "
        "these exons will only be used for B", default=None)

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for "
        "certain parts of the script", type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    exons_given = args.exons is not None
    exons_a_given = args.exons_a is not None
    exons_b_given = args.exons_b is not None

    # check if the exons files exist
    if exons_given and (not os.path.exists(args.exons)):
        msg = "The exons file does not exist: {}".format(args.exons)
        raise FileNotFoundError(msg)

    if exons_a_given and (not os.path.exists(args.exons_a)):
        msg = "The exons_a file does not exist: {}".format(args.exons_a)
        raise FileNotFoundError(msg)

    if exons_b_given and (not os.path.exists(args.exons_b)):
        msg = "The exons_b file does not exist: {}".format(args.exons_b)
        raise FileNotFoundError(msg)

    # giving exons for only one side is allowed but suspicious, so warn
    exons_a_only = exons_a_given and not exons_b_given
    exons_b_only = not exons_a_given and exons_b_given
    if exons_a_only or exons_b_only:
        msg = ("Only one of --exons-a, --exons-b was given. This is valid, "
            "but please ensure this is the desired behavior.")
        logger.warning(msg)

    # make sure we weren't given contradictory flags
    if args.split and exons_given:
        msg = "Both --split and --exons were given. Only one of these is allowed."
        raise ValueError(msg)

    if exons_given and (exons_a_given or exons_b_given):
        msg = ("Both --exons and (--exons-a or --exons-b) were given. --exons "
            "should not be given with the --exons-a and --exons-b arguments.")
        raise ValueError(msg)

    exons = None
    exons_a = None
    exons_b = None

    msg = "Reading bed A"
    logger.info(msg)
    bed_a = bed_utils.read_bed(args.bed_a)

    msg = "Reading bed B"
    logger.info(msg)
    bed_b = bed_utils.read_bed(args.bed_b)

    if args.split:
        msg = "Splitting bed A"
        logger.info(msg)
        exons_a = parallel.apply_parallel_split(bed_a, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_a = pd.concat(exons_a)

        msg = "Splitting bed B"
        logger.info(msg)
        exons_b = parallel.apply_parallel_split(bed_b, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_b = pd.concat(exons_b)

    if exons_given:
        msg = "Reading exons"
        logger.info(msg)
        exons = bed_utils.read_bed(args.exons)

    if exons_a_given:
        msg = "Reading A exons"
        logger.info(msg)
        exons_a = bed_utils.read_bed(args.exons_a)

    if exons_b_given:
        msg = "Reading B exons"
        logger.info(msg)
        exons_b = bed_utils.read_bed(args.exons_b)

    msg = "Finding all A entries which overlap B entries"
    logger.info(msg)
    remaining_a_ids = bed_utils.subtract_bed(bed_a, bed_b,
        min_a_overlap=args.min_a_overlap, min_b_overlap=args.min_b_overlap,
        exons=exons, exons_a=exons_a, exons_b=exons_b)

    msg = "Filtering the A entries which had overlaps"
    logger.info(msg)
    m_remaining = bed_a['id'].isin(remaining_a_ids)
    bed_a_remaining = bed_a[m_remaining]

    msg = "Writing remaining A entries to disk"
    logger.info(msg)
    bed_utils.write_bed(bed_a_remaining, args.out)
def main():
    """Convert a bed12 file to an equivalent gtf file, emitting "exon" and
    "CDS" entries and carrying extra bed columns through as gtf attributes.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Convert a bed12 file into an equivalent gtf file. In "
        "particular, it uses the \"thick_start\" and \"thick_end\" fields to "
        "determine the CDS gtf entries. It only creates \"exon\" and \"CDS\" "
        "gtf entries. The bed columns after the standard 12 are included as "
        "attributes in the gtf file.")

    parser.add_argument('bed', help="The bed12 file. It must conform to the "
        "style expected by lifesci.bed_utils.")
    parser.add_argument('out', help="The (output) gtf file. It will conform "
        "to the style dictated by lifesci.gtf_utils.")
    parser.add_argument('-s', '--source', help="The name to use for the "
        "\"source\" column in the gtf file", default=default_source)
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for conversion", type=int, default=default_num_cpus)
    parser.add_argument('--add-gene-id', help="If this flag is present, then "
        "the \"id\" field will also be used as the \"gene_id\"",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading bed file")
    bed_df = bed_utils.read_bed(args.bed)

    if args.add_gene_id:
        logger.info("Adding gene_id")
        # reuse the transcript id as the gene id when requested
        bed_df['gene_id'] = bed_df['id']

    logger.info("Expanding bed entries to gtf entries")
    gtf_chunks = parallel.apply_parallel(
        bed_df,
        args.num_cpus,
        gtf_utils.get_gtf_entries,
        args.source,
        progress_bar=True)

    logger.info("Joining gtf entries into large data frame")
    gtf_df = pd.concat(gtf_chunks)

    logger.info("Sorting gtf entries")
    gtf_df = gtf_df.sort_values(['seqname', 'start', 'end'])
    gtf_df = gtf_df.reset_index(drop=True)

    logger.info("Writing gtf to disk")
    gtf_utils.write_gtf(gtf_df, args.out, compress=False)