Ejemplo n.º 1
0
def main():
    """Set the score and color columns to 0 in bed files of the current directory.

    Every file in the working directory whose name ends with one of the
    ``--bed-extensions`` values is considered. Unless ``--no-ask`` is given,
    ``fix_bed`` is consulted for each file (presumably an interactive
    confirmation; it is defined elsewhere) and the file is only rewritten when
    it returns a truthy value. Files are rewritten in place.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script replaces all score and color values in bed files with "
        "'0'. It applies this to all bed files in the current directory.")

    parser.add_argument(
        '--no-ask',
        action='store_true',
        help="By default, the program will ask to replace "
        "the values for each bed file. If this flag is given, then the asking will be "
        "skipped.")

    parser.add_argument(
        '--bed-extensions',
        nargs='+',
        default=default_bed_extensions,
        help="The extensions to treat as "
        "bed files")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    skip_prompt = args.no_ask

    for extension in args.bed_extensions:
        pattern = "*{}".format(extension)

        for path in glob.glob(pattern):
            print("fix: {}".format(path))

            # fix_bed is only called when prompting has not been disabled
            if not skip_prompt and not fix_bed(path):
                continue

            bed = bed_utils.read_bed(path)
            bed['score'] = 0
            bed['color'] = 0

            # overwrite the original file
            bed_utils.write_bed(bed, path)
Ejemplo n.º 2
0
def main():
    """Split a BED12+ file into a BED6+2 file with one feature per block.

    The input file is read, its blocks are split in parallel with
    ``split_all_blocks``, and the resulting frames are concatenated. Only the
    BED6 fields plus ``exon_index`` and ``transcript_start`` are written to
    the output file.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits BED12+ files into a BED6+ file. Each block "
        "(i.e., exon) in the original file is an individual feature in the new file. "
        "There are two extra fields, exon_index and transcript_start, which give the "
        "index of the exon within its transcript and the start of the exon in the "
        "\"spliced\" version of the transcript. The \"id\" column in the original file "
        "is used as the \"id\" in the new file, so the exons can easily be grouped.")

    parser.add_argument('bed', help="The BED12+ file")
    parser.add_argument('out', help="The output BED6+2 file")

    parser.add_argument(
        '-p',
        '--num-cpus',
        type=int,
        default=default_num_cpus,
        help="The number of CPUs to use")

    parser.add_argument(
        '--num-groups',
        type=int,
        default=default_num_groups,
        help="The number of groups to split the "
        "bed file into for parallelization")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading BED12+ file")
    bed = bed_utils.read_bed(args.bed)

    logger.info("Splitting blocks")
    exon_frames = parallel.apply_parallel_split(
        bed,
        args.num_cpus,
        split_all_blocks,
        progress_bar=True,
        num_groups=args.num_groups)

    logger.info("Merging exons into a data frame")
    exons = pd.concat(exon_frames)

    # keep only the BED6 columns and the two exon-specific extras
    output_fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
    exons = exons[output_fields]

    logger.info("Writing BED6+2 file")
    bed_utils.write_bed(exons, args.out)
def main():
    """Merge several bed12+ files and write the sorted, de-duplicated entries.

    All input files are read and concatenated, duplicates (as determined by
    the ``DUPLICATE_FIELDS`` columns) are dropped, and the remaining entries
    are sorted by seqname, start, end and strand before being written to the
    ``--out`` file, optionally compressed.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove duplicate entries from a list of bed12+ files. "
        "Write the non-redundant entries to a new file. Precedence among "
        "duplicates is arbitrary.")

    parser.add_argument('bed', nargs='+', help="The input bed file(s)")
    parser.add_argument(
        '-o',
        '--out',
        required=True,
        help="The output bed(.gz) file")

    parser.add_argument(
        '--compress',
        action='store_true',
        help="If this flag is given, the output "
        "will be gzipped. The output filename *will not* be changed (so it "
        "should already end in \".gz\").")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logger.info("Reading bed files")
    bed_frames = []
    for bed_path in args.bed:
        bed_frames.append(bed_utils.read_bed(bed_path))

    # report the size of each input individually
    for bed_path, bed_frame in zip(args.bed, bed_frames):
        logger.debug(
            "{}. number of entities: {}".format(bed_path, len(bed_frame)))

    logger.info("Concatenating bed entries")
    merged = pd.concat(bed_frames)

    logger.info("Removing duplicate entries")
    merged = merged.drop_duplicates(subset=DUPLICATE_FIELDS)

    logger.debug("number of non-redundant entries: {}".format(len(merged)))

    logger.info("Sorting non-redundant entries")
    merged = merged.sort_values(by=['seqname', 'start', 'end', 'strand'])

    logger.info("Writing sorted, non-redundant entries to disk")
    bed_utils.write_bed(merged, args.out, compress=args.compress)
Ejemplo n.º 4
0
def main():
    """Remove the entries of bed file A which overlap entries of bed file B.

    The overlap computation is delegated to ``bed_utils.subtract_bed``, which
    returns the ids of the A entries to keep. Exon-level information can be
    supplied in three mutually exclusive ways: computed on the fly
    (``--split``), from a single shared exon file (``--exons``), or from
    per-input exon files (``--exons-a`` / ``--exons-b``).

    Raises:
        FileNotFoundError: if any exon file given on the command line does
            not exist.
        ValueError: if ``--split`` is combined with ``--exons``, or if
            ``--exons`` is combined with ``--exons-a`` or ``--exons-b``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script removes all of the entries from A which overlap "
        "any of the entries from B. Optionally, some minimum overlap can be "
        "given, in terms of overlap fraction.")

    parser.add_argument('bed_a',
                        help="The bed file from which entries will be "
                        "removed")
    parser.add_argument('bed_b',
                        help="The bed file used to find entries in A "
                        "to remove")

    parser.add_argument('out', help="The output (bed.gz) file")

    parser.add_argument(
        '--min-a-overlap',
        help="A minimum fraction required "
        "for overlap of the A entries. \"0\" means \"at least one bp.\"",
        type=float,
        default=default_min_a_overlap)

    parser.add_argument(
        '--min-b-overlap',
        help="A minimum fraction required "
        "for overlap of the B entries. \"0\" means \"at least one bp.\"",
        type=float,
        default=default_min_b_overlap)

    parser.add_argument(
        '--split',
        help="If this flag is given, then the bed "
        "entries in both files will be split. This can be somewhat slow, "
        "depending on the number of entries in the files.",
        action='store_true')

    parser.add_argument(
        '--exons',
        help="If the bed entries have already been "
        "split and the exon bed6+2 file (from split-bed12-blocks program) is "
        "available, then that can be given with this option. The exons from "
        "that file will be used for both A and B.",
        default=None)

    parser.add_argument('--exons-a',
                        help="As with the --exons argument, but "
                        "these exons will only be used for A",
                        default=None)

    parser.add_argument('--exons-b',
                        help="As with the --exons argument, but "
                        "these exons will only be used for B",
                        default=None)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use for "
                        "certain parts of the script",
                        type=int,
                        default=default_num_cpus)

    # The --split branch below needs a group count; without this option,
    # reading args.num_groups raised AttributeError whenever --split was used.
    parser.add_argument('--num-groups',
                        help="The number of groups to split the bed "
                        "entries into for parallelization when --split is "
                        "given. If not given, the value of --num-cpus is "
                        "used.",
                        type=int,
                        default=None)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # fall back to one group per CPU when no explicit count was requested
    num_groups = args.num_groups
    if num_groups is None:
        num_groups = args.num_cpus

    exons_given = args.exons is not None
    exons_a_given = args.exons_a is not None
    exons_b_given = args.exons_b is not None

    # check if the exons files exist
    if exons_given and (not os.path.exists(args.exons)):
        msg = "The exons file does not exist: {}".format(args.exons)
        raise FileNotFoundError(msg)

    if exons_a_given and (not os.path.exists(args.exons_a)):
        msg = "The exons_a file does not exist: {}".format(args.exons_a)
        raise FileNotFoundError(msg)

    if exons_b_given and (not os.path.exists(args.exons_b)):
        msg = "The exons_b file does not exist: {}".format(args.exons_b)
        raise FileNotFoundError(msg)

    exons_a_only = exons_a_given and not exons_b_given
    exons_b_only = not exons_a_given and exons_b_given
    if exons_a_only or exons_b_only:
        msg = ("Only one of --exons-a, --exons-b was given. This is valid, "
               "but please ensure this is the desired behavior.")
        logger.warning(msg)

    # make sure we weren't given contradictory flags
    if args.split and exons_given:
        msg = "Both --split and --exons were given. Only one of these is allowed."
        raise ValueError(msg)

    if exons_given and (exons_a_given or exons_b_given):
        msg = (
            "Both --exons and (--exons-a or --exons-b) were given. --exons "
            "should not be given with the --exons-a and --exons-b arguments.")
        raise ValueError(msg)

    exons = None
    exons_a = None
    exons_b = None

    msg = "Reading bed A"
    logger.info(msg)
    bed_a = bed_utils.read_bed(args.bed_a)

    msg = "Reading bed B"
    logger.info(msg)
    bed_b = bed_utils.read_bed(args.bed_b)

    if args.split:
        msg = "Splitting bed A"
        logger.info(msg)

        exons_a = parallel.apply_parallel_split(bed_a,
                                                args.num_cpus,
                                                split_all_blocks,
                                                progress_bar=True,
                                                num_groups=num_groups)

        exons_a = pd.concat(exons_a)

        msg = "Splitting bed B"
        logger.info(msg)

        exons_b = parallel.apply_parallel_split(bed_b,
                                                args.num_cpus,
                                                split_all_blocks,
                                                progress_bar=True,
                                                num_groups=num_groups)

        exons_b = pd.concat(exons_b)

    if exons_given:
        msg = "Reading exons"
        logger.info(msg)
        exons = bed_utils.read_bed(args.exons)

    # NOTE: explicitly given exon files replace any exons computed by --split
    if exons_a_given:
        msg = "Reading A exons"
        logger.info(msg)
        exons_a = bed_utils.read_bed(args.exons_a)

    if exons_b_given:
        msg = "Reading B exons"
        logger.info(msg)
        exons_b = bed_utils.read_bed(args.exons_b)

    msg = "Finding all A entries which overlap B entries"
    logger.info(msg)

    remaining_a_ids = bed_utils.subtract_bed(bed_a,
                                             bed_b,
                                             min_a_overlap=args.min_a_overlap,
                                             min_b_overlap=args.min_b_overlap,
                                             exons=exons,
                                             exons_a=exons_a,
                                             exons_b=exons_b)

    msg = "Filtering the A entries which had overlaps"
    logger.info(msg)

    # keep only the A rows whose id survived the subtraction
    m_remaining = bed_a['id'].isin(remaining_a_ids)
    bed_a_remaining = bed_a[m_remaining]

    msg = "Writing remaining A entries to disk"
    logger.info(msg)

    bed_utils.write_bed(bed_a_remaining, args.out)
Ejemplo n.º 5
0
def main():
    """Convert a GTF file into a BED12 file.

    Exon features are grouped by transcript id and turned into BED12 entries
    with ``get_bed12_entry``. The smallest CDS start (shifted by one, since
    GTF is 1-based) and the largest CDS end of each transcript become the
    ``thick_start`` and ``thick_end`` columns; transcripts without CDS
    features get -1 for both. The entries are sorted with ``bed_utils.sort``
    before being written.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts a GTF file to a BED12 file. In particular, "
        "it creates bed entries based on the exon features and transcript_id field. "
        "It uses the CDS regions to determine the \"thick_start\" and \"thick_end\" "
        "features of the BED12 file.")

    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The (output) BED12 file")

    parser.add_argument(
        '--chr-name-file',
        default=default_chr_name_file,
        help="If this file is given, then the "
        "bed entries will be sorted according to the order of seqnames in this "
        "file. Presumably, this is the chrName.txt file from STAR.")

    parser.add_argument(
        '--exon-feature',
        default=default_exon_feature,
        help="The name of features which are "
        "treated as exons")
    parser.add_argument(
        '--cds-feature',
        default=default_cds_feature,
        help="The name of features which are "
        "treated as CDSs")

    parser.add_argument(
        '-p',
        '--num-cpus',
        type=int,
        default=default_num_cpus,
        help="The number of CPUs to use")
    parser.add_argument(
        '-g',
        '--num-groups',
        type=int,
        default=default_num_groups,
        help="The number of groups to split "
        "into for parallelization")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    def extract_transcript_ids(features):
        # run get_transcript_ids over the frame in parallel and flatten the
        # per-group results into a single list
        per_group_ids = parallel.apply_parallel_split(
            features,
            args.num_cpus,
            get_transcript_ids,
            progress_bar=True,
            num_groups=args.num_groups)
        return collection_utils.flatten_lists(per_group_ids)

    logger.info("Reading GTF file")
    gtf = gtf_utils.read_gtf(args.gtf)

    logger.info("Extracting exon and CDS features")
    is_exon = gtf['feature'] == args.exon_feature
    is_cds = gtf['feature'] == args.cds_feature

    # copies, because columns are added to both frames below
    exons = gtf[is_exon].copy()
    cds_df = gtf[is_cds].copy()

    logger.info("Extracting CDS transcript ids")
    cds_df['transcript_id'] = extract_transcript_ids(cds_df)

    logger.info("Calculating CDS genomic start and end positions")
    cds_by_transcript = cds_df.groupby('transcript_id')

    # we subtract 1 from start because gtf is 1-based
    first_cds_start = cds_by_transcript['start'].min()
    cds_start_df = pd.DataFrame()
    cds_start_df['id'] = first_cds_start.index
    cds_start_df['cds_start'] = first_cds_start.values - 1

    # we do not subtract 1 from end because bed is "open" on the end
    last_cds_end = cds_by_transcript['end'].max()
    cds_end_df = pd.DataFrame()
    cds_end_df['id'] = last_cds_end.index
    cds_end_df['cds_end'] = last_cds_end.values

    logger.info("Extracting exon transcript ids")
    exons['transcript_id'] = extract_transcript_ids(exons)

    # exon lengths are stored as strings
    exons['length'] = (exons['end'] - exons['start'] + 1).astype(str)

    # store these for sorting later; this must happen before the exons are
    # reordered by start position
    transcript_ids = np.array(exons['transcript_id'])

    logger.info("Combining exons into BED12 entries")
    exons_by_transcript = exons.sort_values('start').groupby('transcript_id')

    bed12_entries = parallel.apply_parallel_groups(
        exons_by_transcript,
        args.num_cpus,
        get_bed12_entry,
        progress_bar=True)
    bed12_df = pd.DataFrame(bed12_entries)

    logger.info("Joining BED12 entries to CDS information")
    for cds_info in (cds_start_df, cds_end_df):
        bed12_df = bed12_df.merge(cds_info, on='id', how='left')

    # missing values (e.g., transcripts without CDS information) become -1
    bed12_df = bed12_df.fillna(-1)

    bed12_df['thick_start'] = bed12_df['cds_start'].astype(int)
    bed12_df['thick_end'] = bed12_df['cds_end'].astype(int)

    logger.info("Sorting BED12 entries")

    # We will break ties among transcripts by the order they appear
    # in the GTF file. This is the same way star breaks ties.
    bed12_df = bed_utils.sort(
        bed12_df,
        seqname_order=args.chr_name_file,
        transcript_ids=transcript_ids)

    logger.info("Writing BED12 to disk")
    bed_utils.write_bed(bed12_df[bed_utils.bed12_field_names], args.out)
Ejemplo n.º 6
0
def main():
    """Convert an NCBI CCDS text file into a BED12 file.

    The simple per-record values (chromosome, CDS range, strand, id) are
    copied directly from the tab-separated CCDS file. The block (exon)
    columns are produced by ``parse_cds_locations``, applied in parallel;
    records for which it returns ``None`` are dropped by the inner merge on
    ``id``. The columns are reordered to the BED12 field order before writing.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts the CCDS text files distributed by NCBI to "
        "valid BED12 files for use with other programs. Please see "
        "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/README, \"CCDS.[YearMonthDay].txt\" for "
        "more information.")
    parser.add_argument('ccds', help="The CCDS.txt file downloaded from NCBI")
    parser.add_argument('out', help="The output bed.gz file")

    parser.add_argument('-i',
                        '--ignore',
                        help="The ccds_status entries to ignore.",
                        default=default_ccds_status_to_ignore,
                        nargs='*')

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use "
                        "for extracting the exon information",
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading CCDS file"
    logger.info(msg)

    ccds_df = pd.read_csv(args.ccds, sep='\t')

    msg = "Copying simple values to BED"
    logger.info(msg)

    ccds_bed = pd.DataFrame()
    ccds_bed['seqname'] = ccds_df['#chromosome']
    ccds_bed['start'] = ccds_df['cds_from']
    ccds_bed['end'] = ccds_df['cds_to']
    # the id combines the gene name and the CCDS id
    ccds_bed['id'] = ccds_df['gene'] + ":" + ccds_df['ccds_id']
    ccds_bed['score'] = 0
    ccds_bed['strand'] = ccds_df['cds_strand']
    ccds_bed['thick_start'] = ccds_df['cds_from']
    ccds_bed['thick_end'] = ccds_df['cds_to']
    ccds_bed['color'] = 0

    msg = "Converting CCDS exons into BED12 blocks"
    logger.info(msg)

    # args is forwarded to parse_cds_locations (presumably so it can honor
    # --ignore; confirm against its definition)
    cds_exon_info = parallel.apply_parallel(ccds_df,
                                            args.num_cpus,
                                            parse_cds_locations,
                                            args,
                                            progress_bar=True)

    # drop the records for which no exon information was produced
    cds_exon_info = [cei for cei in cds_exon_info if cei is not None]
    cds_exon_df = pd.DataFrame(cds_exon_info)

    msg = "Merging simple values and blocks"
    logger.info(msg)

    ccds_bed = ccds_bed.merge(cds_exon_df, on='id')

    # put the columns in the correct order; the field names come from the
    # same bed_utils module that is used for writing below
    ccds_bed = ccds_bed[bed_utils.bed12_field_names]

    msg = "Writing the BED file"
    logger.info(msg)

    bed_utils.write_bed(ccds_bed, args.out)