def main():
    """Calculate the base-pair overlap of two BED12+ files.

    Prints the number of bases unique to each input file and the number
    of bases covered by both, using bedtools subtract/intersect with
    split (per-block) semantics.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script calculates the overlap, in base pairs, of two bed files. "
        "It also calculates the number of base pairs which are unique to each bed file.")

    parser.add_argument('bed_1', help="The first BED12+ file")
    parser.add_argument('bed_2', help="The second BED12+ file")
    parser.add_argument('--num-cpus', help="The number of CPUs to use when counting the "
        "BED12 feature lengths", type=int, default=default_num_cpus)
    args = parser.parse_args()

    # bedtools must be installed for the subtract/intersect calls below
    programs = ['subtractBed']
    utils.check_programs_exist(programs)

    # read in the files and convert for use in bedtools
    bed_1_df = bed_utils.read_bed(args.bed_1)
    bed_2_df = bed_utils.read_bed(args.bed_2)

    bed_1 = pybedtools.BedTool.from_dataframe(
        bed_1_df[bed_utils.bed12_field_names])
    bed_2 = pybedtools.BedTool.from_dataframe(
        bed_2_df[bed_utils.bed12_field_names])

    # first, the bases unique to bed_1
    bed_1_only = bed_1.subtract(bed_2, split=True)
    bed_1_only_df = bed_1_only.to_dataframe(names=bed_utils.bed12_field_names)

    bed_1_only_sizes = parallel.apply_parallel(
        bed_1_only_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_1_only_coverage = np.sum(bed_1_only_sizes)

    # now the bed_2 unique bases
    bed_2_only = bed_2.subtract(bed_1, split=True)
    # fix: the original referenced bio.bed12_field_names here; every other
    # call in this function uses bed_utils.bed12_field_names, and "bio" is
    # not used anywhere else in this block
    bed_2_only_df = bed_2_only.to_dataframe(names=bed_utils.bed12_field_names)

    bed_2_only_sizes = parallel.apply_parallel(
        bed_2_only_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_2_only_coverage = np.sum(bed_2_only_sizes)

    # and the overlap
    bed_1_and_2 = bed_1.intersect(bed_2, split=True)
    bed_1_and_2_df = bed_1_and_2.to_dataframe(
        names=bed_utils.bed12_field_names)

    bed_1_and_2_sizes = parallel.apply_parallel(
        bed_1_and_2_df, args.num_cpus, bed_utils.get_bed_12_feature_length)
    bed_1_and_2_coverage = np.sum(bed_1_and_2_sizes)

    print("{} unique bases: {}".format(args.bed_1, bed_1_only_coverage))
    print("{} unique bases: {}".format(args.bed_2, bed_2_only_coverage))
    print("Overlapping bases: {}".format(bed_1_and_2_coverage))
def main():
    """Replace all score and color values with 0 in every bed file found
    in the current directory (by extension), optionally prompting before
    modifying each file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script replaces all score and color values in bed files with "
        "'0'. It applies this to all bed files in the current directory.")

    parser.add_argument('--no-ask', help="By default, the program will ask to replace "
        "the values for each bed file. If this flag is given, then the asking will be "
        "skipped.", action='store_true')
    parser.add_argument('--bed-extensions', help="The extensions to treat as "
        "bed files", nargs='+', default=default_bed_extensions)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    ask = not args.no_ask

    for bed_extension in args.bed_extensions:
        # fix: renamed this local from "re" so it no longer shadows the
        # standard-library regex module; it is a glob pattern, not a regex
        file_pattern = "*{}".format(bed_extension)
        bed_files = glob.glob(file_pattern)

        for bed_file in bed_files:
            print("fix: {}".format(bed_file))

            # fix_bed presumably prompts the user about this file — TODO
            # confirm; --no-ask bypasses the prompt entirely
            if (not ask) or fix_bed(bed_file):
                bed = bed_utils.read_bed(bed_file)
                bed['score'] = 0
                bed['color'] = 0
                bed_utils.write_bed(bed, bed_file)
def main():
    """Split each BED12+ feature into its blocks, producing a BED6+2 file
    with one exon per row (extra columns: exon_index, transcript_start).
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits BED12+ files into a BED6+ file. Each block "
        "(i.e., exon) in the original file is an individual feature in the new file. "
        "There are two extra fields, exon_index and transcript_start, which give the "
        "index of the exon within its transcript and the start of the exon in the "
        "\"spliced\" version of the transcript. The \"id\" column in the original file "
        "is used as the \"id\" in the new file, so the exons can easily be grouped.")

    parser.add_argument('bed', help="The BED12+ file")
    parser.add_argument('out', help="The output BED6+2 file")
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    parser.add_argument('--num-groups', help="The number of groups to split the "
        "bed file into for parallelization", type=int, default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading BED12+ file")
    bed12_df = bed_utils.read_bed(args.bed)

    logger.info("Splitting blocks")
    # each group of records is expanded to its exons in parallel
    exon_groups = parallel.apply_parallel_split(
        bed12_df,
        args.num_cpus,
        split_all_blocks,
        progress_bar=True,
        num_groups=args.num_groups
    )

    logger.info("Merging exons into a data frame")
    exons_df = pd.concat(exon_groups)

    # keep only the BED6 columns plus the two bookkeeping fields, in order
    column_order = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
    exons_df = exons_df[column_order]

    logger.info("Writing BED6+2 file")
    bed_utils.write_bed(exons_df, args.out)
def main():
    """Merge several bed12+ files, drop duplicate entries, sort the
    survivors, and write the result to a single output file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove duplicate entries from a list of bed12+ files. "
        "Write the non-redundant entries to a new file. Precedence among "
        "duplicates is arbitrary.")

    parser.add_argument('bed', help="The input bed file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output bed(.gz) file",
        required=True)
    parser.add_argument('--compress', help="If this flag is given, the output "
        "will be gzipped. The output filename *will not* be changed (so it "
        "should already end in \".gz\").", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading bed files")
    bed_frames = [bed_utils.read_bed(bed_path) for bed_path in args.bed]

    for bed_path, frame in zip(args.bed, bed_frames):
        logger.debug("{}. number of entities: {}".format(bed_path, len(frame)))

    logger.info("Concatenating bed entries")
    combined = pd.concat(bed_frames)

    logger.info("Removing duplicate entries")
    # which duplicate survives is arbitrary (drop_duplicates keeps the first)
    combined = combined.drop_duplicates(subset=DUPLICATE_FIELDS)
    logger.debug("number of non-redundant entries: {}".format(len(combined)))

    logger.info("Sorting non-redundant entries")
    combined = combined.sort_values(by=['seqname', 'start', 'end', 'strand'])

    logger.info("Writing sorted, non-redundant entries to disk")
    bed_utils.write_bed(combined, args.out, compress=args.compress)
def main():
    """Remove from bed file A every entry which overlaps an entry from
    bed file B, optionally requiring minimum overlap fractions, and write
    the surviving A entries to disk.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the entries from A which overlap "
        "any of the entries from B. Optionally, some minimum overlap can be "
        "given, in terms of overlap fraction.")

    parser.add_argument('bed_a', help="The bed file from which entries will be "
        "removed")
    parser.add_argument('bed_b', help="The bed file used to find entries in A "
        "to remove")
    parser.add_argument('out', help="The output (bed.gz) file")

    parser.add_argument('--min-a-overlap', help="A minimum fraction required "
        "for overlap of the A entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_a_overlap)
    parser.add_argument('--min-b-overlap', help="A minimum fraction required "
        "for overlap of the B entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_b_overlap)

    parser.add_argument('--split', help="If this flag is given, then the bed "
        "entries in both files will be split. This can be somewhat slow, "
        "depending on the number of entries in the files.", action='store_true')

    # fix: the --split branch below reads args.num_groups, but the original
    # script never added this argument, so giving --split always raised
    # AttributeError. Default of 1 means "no extra grouping" unless requested.
    parser.add_argument('--num-groups', help="The number of groups to split "
        "the bed files into for parallelization when --split is given",
        type=int, default=1)

    parser.add_argument('--exons', help="If the bed entries have already been "
        "split and the exon bed6+2 file (from split-bed12-blocks program) is "
        "available, then that can be given with this option. The exons from "
        "that file will be used for both A and B.", default=None)
    parser.add_argument('--exons-a', help="As with the --exons argument, but "
        "these exons will only be used for A", default=None)
    parser.add_argument('--exons-b', help="As with the --exons argument, but "
        "these exons will only be used for B", default=None)

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for "
        "certain parts of the script", type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    exons_given = args.exons is not None
    exons_a_given = args.exons_a is not None
    exons_b_given = args.exons_b is not None

    # check if the exons files exist
    if exons_given and (not os.path.exists(args.exons)):
        msg = "The exons file does not exist: {}".format(args.exons)
        raise FileNotFoundError(msg)

    if exons_a_given and (not os.path.exists(args.exons_a)):
        msg = "The exons_a file does not exist: {}".format(args.exons_a)
        raise FileNotFoundError(msg)

    if exons_b_given and (not os.path.exists(args.exons_b)):
        msg = "The exons_b file does not exist: {}".format(args.exons_b)
        raise FileNotFoundError(msg)

    # giving exons for only one side is allowed but suspicious, so warn
    exons_a_only = exons_a_given and not exons_b_given
    exons_b_only = not exons_a_given and exons_b_given
    if exons_a_only or exons_b_only:
        msg = ("Only one of --exons-a, --exons-b was given. This is valid, "
            "but please ensure this is the desired behavior.")
        logger.warning(msg)

    # make sure we weren't given contradictory flags
    if args.split and exons_given:
        msg = "Both --split and --exons were given. Only one of these is allowed."
        raise ValueError(msg)

    if exons_given and (exons_a_given or exons_b_given):
        msg = ("Both --exons and (--exons-a or --exons-b) were given. --exons "
            "should not be given with the --exons-a and --exons-b arguments.")
        raise ValueError(msg)

    exons = None
    exons_a = None
    exons_b = None

    msg = "Reading bed A"
    logger.info(msg)
    bed_a = bed_utils.read_bed(args.bed_a)

    msg = "Reading bed B"
    logger.info(msg)
    bed_b = bed_utils.read_bed(args.bed_b)

    if args.split:
        msg = "Splitting bed A"
        logger.info(msg)
        exons_a = parallel.apply_parallel_split(bed_a, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_a = pd.concat(exons_a)

        msg = "Splitting bed B"
        logger.info(msg)
        exons_b = parallel.apply_parallel_split(bed_b, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_b = pd.concat(exons_b)

    if exons_given:
        msg = "Reading exons"
        logger.info(msg)
        exons = bed_utils.read_bed(args.exons)

    if exons_a_given:
        msg = "Reading A exons"
        logger.info(msg)
        exons_a = bed_utils.read_bed(args.exons_a)

    if exons_b_given:
        msg = "Reading B exons"
        logger.info(msg)
        exons_b = bed_utils.read_bed(args.exons_b)

    msg = "Finding all A entries which overlap B entries"
    logger.info(msg)
    remaining_a_ids = bed_utils.subtract_bed(bed_a, bed_b,
        min_a_overlap=args.min_a_overlap, min_b_overlap=args.min_b_overlap,
        exons=exons, exons_a=exons_a, exons_b=exons_b)

    msg = "Filtering the A entries which had overlaps"
    logger.info(msg)
    m_remaining = bed_a['id'].isin(remaining_a_ids)
    bed_a_remaining = bed_a[m_remaining]

    msg = "Writing remaining A entries to disk"
    logger.info(msg)
    bed_utils.write_bed(bed_a_remaining, args.out)
def main():
    """Convert a bed12 file to an equivalent gtf file, emitting "exon" and
    "CDS" entries and carrying extra bed columns through as gtf attributes.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Convert a bed12 file into an equivalent gtf file. In "
        "particular, it uses the \"thick_start\" and \"thick_end\" fields to "
        "determine the CDS gtf entries. It only creates \"exon\" and \"CDS\" "
        "gtf entries. The bed columns after the standard 12 are included as "
        "attributes in the gtf file.")

    parser.add_argument('bed', help="The bed12 file. It must conform to the "
        "style expected by lifesci.bed_utils.")
    parser.add_argument('out', help="The (output) gtf file. It will conform "
        "to the style dictated by lifesci.gtf_utils.")
    parser.add_argument('-s', '--source', help="The name to use for the "
        "\"source\" column in the gtf file", default=default_source)
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for conversion", type=int, default=default_num_cpus)
    parser.add_argument('--add-gene-id', help="If this flag is present, then "
        "the \"id\" field will also be used as the \"gene_id\"",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading bed file")
    bed_df = bed_utils.read_bed(args.bed)

    if args.add_gene_id:
        logger.info("Adding gene_id")
        # reuse the transcript id as the gene id when requested
        bed_df['gene_id'] = bed_df['id']

    logger.info("Expanding bed entries to gtf entries")
    gtf_chunks = parallel.apply_parallel(
        bed_df,
        args.num_cpus,
        gtf_utils.get_gtf_entries,
        args.source,
        progress_bar=True)

    logger.info("Joining gtf entries into large data frame")
    gtf_df = pd.concat(gtf_chunks)

    logger.info("Sorting gtf entries")
    gtf_df = gtf_df.sort_values(['seqname', 'start', 'end'])
    gtf_df = gtf_df.reset_index(drop=True)

    logger.info("Writing gtf to disk")
    gtf_utils.write_gtf(gtf_df, args.out, compress=False)