def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all duplicate sequences from a list of fasta "
        "files and writes the remaining sequences back out as a fasta file."
        "\n\n"
        "If desired, a regular expression can be given for \"lower precedence\" "
        "sequence identifiers. An example of using this precedence operator "
        "is in removing duplicate sequences from a fasta file which combines "
        "de novo assembled transcripts and annotated ones. In case a de novo "
        "transcript matches an annotated one, we would prefer to keep only "
        "the annotated transcript and identifier. Thus, we would pass an RE "
        "matching the de novo assembled identifiers (which have a lower precedence)."
        "\n\n"
        "If a precedence re is not given, or two identifiers have the same "
        "precedence, the first identifier encountered will be kept.")

    parser.add_argument('fasta', help="The input fasta file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output (fasta[.gz]) file",
        required=True)
    parser.add_argument('--compress', help="If this flag is given, then the output "
        "will be gzipped.", action='store_true')
    parser.add_argument('-l', '--lower-precedence-re', help="A regular expression "
        "that matches the identifiers of lower precendence transcripts. (See the "
        "description for more details.)", default=default_lower_precedence_re)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    files = '\n'.join(args.fasta)
    msg = "Found the following files from the command line: {}".format(files)
    logger.info(msg)

    fastx_utils.remove_duplicate_sequences(args.fasta, args.out, compress=args.compress, 
        lower_precedence_re=args.lower_precedence_re, progress_bar=True)
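
# A minimal, hypothetical sketch of the precedence behavior described above.
# The real logic lives in fastx_utils.remove_duplicate_sequences (not shown
# here); this standalone helper only illustrates how a lower-precedence RE
# could break ties between identifiers that share a sequence.
import re

def deduplicate_records(records, lower_precedence_re=None):
    """Keep one identifier per unique sequence.

    records: iterable of (identifier, sequence) pairs
    lower_precedence_re: identifiers matching this RE lose ties
    """
    pattern = re.compile(lower_precedence_re) if lower_precedence_re else None
    kept = {}  # sequence -> identifier

    for identifier, sequence in records:
        previous = kept.get(sequence)
        if previous is None:
            kept[sequence] = identifier
        elif pattern is not None:
            # replace the stored identifier only if it has lower precedence
            # and the new one does not
            if pattern.search(previous) and not pattern.search(identifier):
                kept[sequence] = identifier
        # otherwise, the first identifier encountered is kept

    return [(identifier, sequence) for sequence, identifier in kept.items()]
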
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script filters bam A by the read ids in bam B. In "
        "particular, only the reads in A with ids *which appear* in B are kept.")

    parser.add_argument('bam_a', help="The bam file to filter")
    parser.add_argument('bam_b', help="The bam file whose ids will be kept in A")
    parser.add_argument('bam_out', help="The output (bam) file")
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading identifiers from B"
    logger.info(msg)
    ids_to_keep = bam_utils.get_read_identifiers(args.bam_b)

    msg = "Filtering reads from A which do not appear in B"
    logger.info(msg)

    with ExitStack() as stack:
        bam_a = stack.enter_context(bam_utils.get_pysam_alignment_file(args.bam_a))
        bam_out = stack.enter_context(bam_utils.get_pysam_alignment_file(
            args.bam_out, "wb", template=bam_a))

        for read in bam_a.fetch():
            if read.query_name in ids_to_keep:
                bam_out.write(read)
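
# bam_utils.get_read_identifiers is not shown above; a plausible pysam-based
# sketch of what it might do, for illustration only.
import pysam

def get_read_identifiers_sketch(bam_path):
    """Collect the set of query names appearing in a bam file."""
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        # until_eof=True streams the file without requiring an index and
        # also visits unmapped reads, which still carry identifiers
        return {read.query_name for read in bam.fetch(until_eof=True)}
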
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script removes all reads with multiple alignments from a bam file. "
        "It then sorts and indexes the reads.")
    parser.add_argument('align_in', help="The input alignment file")
    parser.add_argument(
        'align_out',
        help="The output alignment file with multimappers removed")
    parser.add_argument(
        '--tmp',
        help="The path where temporary files for samtools sort will "
        "be stored. If not given, then the samtools default tmp choice will be used.",
        default=default_tmp)

    parser.add_argument('--do-not-call', help="If this flag is given, the "
        "commands will not actually be executed", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    call = not args.do_not_call
    bam_utils.remove_multimapping_reads(args.align_in,
                                        args.align_out,
                                        call=call,
                                        tmp=args.tmp)
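
# A sketch of one way bam_utils.remove_multimapping_reads could behave.
# Treating secondary/supplementary alignments and reads with NH > 1 as
# multimappers is an assumption; the sort and index steps mirror the
# description in the parser above.
import os
import pysam

def remove_multimapping_reads_sketch(align_in, align_out, tmp=None):
    unsorted_out = align_out + ".unsorted.bam"

    with pysam.AlignmentFile(align_in, "rb") as bam_in, \
            pysam.AlignmentFile(unsorted_out, "wb", template=bam_in) as bam_out:
        for read in bam_in.fetch(until_eof=True):
            if read.is_unmapped or read.is_secondary or read.is_supplementary:
                continue
            num_hits = read.get_tag("NH") if read.has_tag("NH") else 1
            if num_hits == 1:
                bam_out.write(read)

    # sort and index the filtered alignments
    sort_args = ["-o", align_out, unsorted_out]
    if tmp is not None:
        sort_args = ["-T", tmp] + sort_args
    pysam.sort(*sort_args)
    pysam.index(align_out)
    os.remove(unsorted_out)
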
def parse_arguments() -> argparse.Namespace:

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('config',
                        help="The path to the yaml configuration "
                        "file.")

    parser.add_argument('--chunk-size',
                        type=int,
                        default=100,
                        help="The size "
                        "of chunks for parallelization")

    parser.add_argument(
        '--num-notes',
        type=int,
        default=None,
        help="The "
        "number of notes to read in. This is mostly for debugging purposes.")

    dask_utils.add_dask_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    return args
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script replaces all score and color values in bed files with "
        "'0'. It applies this to all bed files in the current directory.")

    parser.add_argument('--no-ask', help="By default, the program will ask before "
        "replacing the values in each bed file. If this flag is given, the prompt "
        "will be skipped.", action='store_true')

    parser.add_argument('--bed-extensions', help="The extensions to treat as "
        "bed files", nargs='+', default=default_bed_extensions)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    ask = not args.no_ask

    for bed_extension in args.bed_extensions:
        re = "*{}".format(bed_extension)
        bed_files = glob.glob(re)

        for bed_file in bed_files:
            print("fix: {}".format(bed_file))
            if (not ask) or fix_bed(bed_file):
                bed = bed_utils.read_bed(bed_file)
                bed['score'] = 0
                bed['color'] = 0

                bed_utils.write_bed(bed, bed_file)
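
# fix_bed is referenced in the loop above but not defined here; a plausible
# sketch of the confirmation prompt it might implement.
def fix_bed_sketch(bed_file):
    """Ask whether the score and color columns of bed_file should be zeroed."""
    answer = input("Replace score/color values in '{}'? [y/N] ".format(bed_file))
    return answer.strip().lower() in ("y", "yes")
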
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of reads in the given fastq "
        "(possibly gzipped) files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames",
        nargs='+')

    parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    read_counts = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        read_count = fastx_utils.get_read_count(f, is_fasta=False)

        datasets.append(pyllars.utils.get_basename(f))
        read_counts.append(read_count)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['reads'] = read_counts

    msg = "Writing data frame to disk"
    logger.info(msg)

    pd_utils.write_df(df, args.out, index=False)
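
# A rough sketch of how fastx_utils.get_read_count might count fastq reads
# (four lines per record, with optional gzip support); the real helper may
# use a different strategy.
import gzip

def get_fastq_read_count_sketch(path):
    open_func = gzip.open if path.endswith(".gz") else open
    with open_func(path, "rt") as handle:
        num_lines = sum(1 for _ in handle)
    return num_lines // 4
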
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits BED12+ files into a BED6+ file. Each block "
        "(i.e., exon) in the original file is an individual feature in the new file. "
        "There are two extra fields, exon_index and transcript_start, which give the "
        "index of the exon within its transcript and the start of the exon in the "
        "\"spliced\" version of the transcript. The \"id\" column in the original file "
        "is used as the \"id\" in the new file, so the exons can easily be grouped.")

    parser.add_argument('bed', help="The BED12+ file")
    parser.add_argument('out', help="The output BED6+2 file")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--num-groups', help="The number of groups to split the "
        "bed file into for parallelization", type=int, default=default_num_groups)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    
    msg = "Reading BED12+ file"
    logger.info(msg)

    bed = bed_utils.read_bed(args.bed)

    msg = "Splitting blocks"
    logger.info(msg)

    exons = parallel.apply_parallel_split(
        bed,
        args.num_cpus,
        split_all_blocks,
        progress_bar=True,
        num_groups=args.num_groups
    )

    msg = "Merging exons into a data frame"
    logger.info(msg)

    exons = pd.concat(exons)

    fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
    exons = exons[fields]

    msg = "Writing BED6+2 file"
    logger.info(msg)

    bed_utils.write_bed(exons, args.out)
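
# split_all_blocks is used above but not defined here. This sketch shows how a
# single BED12+ row could be expanded into BED6+2 exon rows; the
# comma-separated block_sizes/block_starts column names are assumptions based
# on standard BED12, and strand-specific exon ordering is ignored.
import pandas as pd

def split_bed12_row_sketch(row):
    sizes = [int(s) for s in str(row['block_sizes']).rstrip(',').split(',')]
    starts = [int(s) for s in str(row['block_starts']).rstrip(',').split(',')]

    exons = []
    transcript_start = 0
    for exon_index, (size, rel_start) in enumerate(zip(sizes, starts)):
        exons.append({
            'seqname': row['seqname'],
            'start': row['start'] + rel_start,
            'end': row['start'] + rel_start + size,
            'id': row['id'],
            'score': row['score'],
            'strand': row['strand'],
            'exon_index': exon_index,
            'transcript_start': transcript_start,
        })
        transcript_start += size

    return pd.DataFrame(exons)
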
def parse_arguments() -> argparse.Namespace:

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('config',
                        help="The path to the yaml configuration "
                        "file.")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    return args
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove duplicate entries from a list of bed12+ files. "
        "Write the non-redundant entries to a new file. Precedence among "
        "duplicates is arbitrary.")

    parser.add_argument('bed', help="The input bed file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output bed(.gz) file",
        required=True)

    parser.add_argument('--compress', help="If this flag is given, the output "
        "will be gzipped. The output filename *will not* be changed (so it "
        "should already end in \".gz\").", action='store_true')
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed files"
    logger.info(msg)

    all_bed = [
        bed_utils.read_bed(b) for b in args.bed
    ]

    for f, b in zip(args.bed, all_bed):
        msg = "{}. number of entities: {}".format(f, len(b))
        logger.debug(msg)

    msg = "Concatenating bed entries"
    logger.info(msg)
    all_bed_df = pd.concat(all_bed)

    msg = "Removing duplicate entries"
    logger.info(msg)
    all_bed_df = all_bed_df.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "number of non-redundant entries: {}".format(len(all_bed_df))
    logger.debug(msg)

    msg = "Sorting non-redundant entries"
    logger.info(msg)
    sort_fields = ['seqname', 'start', 'end', 'strand']
    all_bed_df = all_bed_df.sort_values(by=sort_fields)

    msg = "Writing sorted, non-redundant entries to disk"
    logger.info(msg)
    bed_utils.write_bed(all_bed_df, args.out, compress=args.compress)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of uniquely- and multi-mapping "
        "reads in a list of bam files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames",
        nargs='+')

    parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    aligned_reads = []
    uniquely_aligned_reads = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        num_aligned_reads = bam_utils.count_aligned_reads(f)
        num_uniquely_aligned_reads = bam_utils.count_uniquely_mapping_reads(f)

        datasets.append(pyllars.utils.get_basename(f))
        aligned_reads.append(num_aligned_reads)
        uniquely_aligned_reads.append(num_uniquely_aligned_reads)

    msg = "Constructing data frame"
    logger.info(msg)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['aligned_reads'] = aligned_reads
    df['uniquely_aligned_reads'] = uniquely_aligned_reads
    df['multimapping_reads'] = df['aligned_reads'] - df['uniquely_aligned_reads']

    msg = "Writing data frame to disk"
    logger.info(msg)

    pd_utils.write_df(df, args.out, index=False)
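
# Possible pysam-based sketches of bam_utils.count_aligned_reads and
# bam_utils.count_uniquely_mapping_reads; counting only primary alignments and
# using the NH tag for uniqueness are assumptions.
import pysam

def count_aligned_reads_sketch(bam_path):
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return sum(1 for read in bam.fetch(until_eof=True)
                   if not read.is_unmapped and not read.is_secondary)

def count_uniquely_mapping_reads_sketch(bam_path):
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return sum(1 for read in bam.fetch(until_eof=True)
                   if not read.is_unmapped
                   and not read.is_secondary
                   and (not read.has_tag("NH") or read.get_tag("NH") == 1))
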
def parse_arguments() -> argparse.Namespace:

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__
    )

    parser.add_argument('config', help="The path to the yaml configuration "
        "file.")

    parser.add_argument('--seed', type=int, default=8675309, help="The seed "
        "for the random number generator")

    parser.add_argument('--chunk-size', type=int, default=100, help="The size "
        "of chunks for parallelization")

    dask_utils.add_dask_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)
    return args
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script removes duplicate reads from paired-end fastq files. "
        "It only treats exact matches as duplicates (i.e., if reads are of different "
        "lengths, it does not consider exact substring matches as duplicates). The "
        "duplicate with the highest average quality score is retained.\n\nThis script "
        "is not designed to work with fasta files.")

    parser.add_argument('fastq_1', help="The first mate file")
    parser.add_argument('fastq_2', help="The second mate file")

    parser.add_argument('out_1', help="The de-duped first mate file")
    parser.add_argument('out_2', help="The de-duped second mate file")

    parser.add_argument(
        '--do-not-compress',
        help="Unless this flag is given, the "
        "output will be gzipped. N.B. \".gz\" *will not* be adde to the file names.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    compress = not args.do_not_compress

    msg = "Counting reads in each file"
    logger.info(msg)

    # we will use the counts for displaying a progress bar
    num_reads_1 = fastx_utils.get_read_count(args.fastq_1, is_fasta=False)
    num_reads_2 = fastx_utils.get_read_count(args.fastq_2, is_fasta=False)

    # but avoid errors by making sure the counts match
    if num_reads_1 != num_reads_2:
        msg = "The number of reads in the files do not match ({} vs. {})".format(
            num_reads_1, num_reads_2)
        raise ValueError(msg)

    msg = "Creating read iterators"
    logger.info(msg)

    fastq_1_iter = fastx_utils.get_read_iterator(args.fastq_1, is_fasta=False)
    fastq_2_iter = fastx_utils.get_read_iterator(args.fastq_2, is_fasta=False)
    fastq_iter = zip(fastq_1_iter, fastq_2_iter)

    msg = "Detecting duplicates"
    logger.info(msg)

    seen_reads = {}

    for r1, r2 in tqdm.tqdm(fastq_iter, total=num_reads_1):
        r_key = (r1[1], r2[1])

        prev_val = seen_reads.get(r_key, None)

        if prev_val is None:
            srp = stored_read_pair(r1_name=r1[0],
                                   r2_name=r2[0],
                                   r1_qual=r1[2],
                                   r2_qual=r2[2])
            seen_reads[r_key] = srp
        else:
            new_qual_score = sum(r1[2].encode()) + sum(r2[2].encode())
            prev_qual_score = (sum(prev_val.r1_qual.encode()) +
                               sum(prev_val.r2_qual.encode()))

            if new_qual_score > prev_qual_score:
                srp = stored_read_pair(r1_name=r1[0],
                                       r2_name=r2[0],
                                       r1_qual=r1[2],
                                       r2_qual=r2[2])
                seen_reads[r_key] = srp

    msg = "Writing the de-duped files to disk"
    logger.info(msg)

    with ExitStack() as stack:
        out_1 = stack.enter_context(
            pyllars.utils.open_file(args.out_1, 'w', compress=compress))
        out_2 = stack.enter_context(
            pyllars.utils.open_file(args.out_2, 'w', compress=compress))

        for (seqs, srp) in tqdm.tqdm(seen_reads.items()):
            fastx_utils._write_fastq_entry(out_1, srp.r1_name, seqs[0],
                                           srp.r1_qual)
            fastx_utils._write_fastq_entry(out_2, srp.r2_name, seqs[1],
                                           srp.r2_qual)
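
# stored_read_pair and fastx_utils._write_fastq_entry are used above but not
# shown; plausible sketches, assuming each read is a (name, sequence, quality)
# tuple as in the iteration above.
import collections

stored_read_pair_sketch = collections.namedtuple(
    "stored_read_pair", ["r1_name", "r2_name", "r1_qual", "r2_qual"])

def write_fastq_entry_sketch(handle, name, sequence, quality):
    handle.write("@{}\n{}\n+\n{}\n".format(name, sequence, quality))
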
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a report for a list of Entrez or Entrez "
        "gene identifiers. In particular, it extracts information from Swiss-Prot, "
        "TrEMBL, Interpro, PDB, Pfam, PROSITE, the Gene Ontology, and KEGG. It uses "
        "the mygene.info service to collect the annotations.")

    parser.add_argument('filename', help="The name of the file")
    parser.add_argument(
        'out',
        help="The output file. It will contain the same information "
        "as the input file with the gene annotations appended as new columns. The format "
        "is the same as the input format.")

    parser.add_argument(
        '-f',
        '--filetype',
        help="The format of the input file. By default, "
        "the format will be guessed based on the file extension.",
        choices=filetype_choices,
        default=default_filetype)

    parser.add_argument('--sep',
                        help="The spearator in the file (if csv)",
                        default=default_sep)

    parser.add_argument(
        '-s',
        '--sheet',
        help="The name of the sheet (for excel files) "
        "or key (for hdf5 files) from which the gene list is extracted. By default, the "
        "first sheet in an excel file is used. This argument is not used for csv files.",
        default=default_sheet)

    parser.add_argument(
        '-c',
        '--column',
        help="The name of the column (or key in hdf5) "
        "from which the gene list is extracted. By default, the first column in the "
        "extracted data frame is used.",
        default=default_column)

    parser.add_argument(
        '--do-not-compress',
        help="If this flag is present and the file "
        "is in csv format, then the output will not be compressed. By default, the output "
        "is compressed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading the file"
    logger.info(msg)

    df = pd_utils.read_df(args.filename,
                          filetype=args.filetype,
                          sheet=args.sheet,
                          sep=args.sep)

    msg = "Extracting gene identifiers"
    logger.info(msg)

    if args.column is None:
        args.column = df.columns[0]

    gene_ids = df[args.column]

    msg = "Pulling information from mygene.info"
    logger.info(msg)

    res_df = mygene_utils.query_mygene(gene_ids)

    msg = "Joining results to original input"
    logger.info(msg)

    res_df = df.merge(res_df,
                      left_on=args.column,
                      right_on='gene_id',
                      how='inner')

    msg = "Writing output"
    logger.info(msg)

    pd_utils.write_df(res_df,
                      args.out,
                      filetype=args.filetype,
                      sheet=args.sheet,
                      do_not_compress=args.do_not_compress,
                      index=False)

    msg = "Finished"
    logger.info(msg)
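
# mygene_utils.query_mygene is not shown; a sketch of how it might wrap the
# official mygene client. The scopes and fields chosen here are illustrative
# assumptions rather than the script's actual query.
import mygene

def query_mygene_sketch(gene_ids):
    mg = mygene.MyGeneInfo()
    res_df = mg.querymany(
        list(gene_ids),
        scopes="entrezgene,ensembl.gene",
        fields="symbol,name,uniprot,interpro,pdb,pfam,prosite,go,pathway.kegg",
        as_dataframe=True)
    # the query terms become the index; expose them as a 'gene_id' column so
    # they can be merged back to the input, as in the main function above
    return res_df.reset_index().rename(columns={"query": "gene_id"})
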
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of unique reads in the "
        "given files, all of which must be the same type. In the case of bam "
        "files, it only counts primary alignments (so it does not "
        "double-count multimappers, and it does not include unmapped reads "
        "present in the file.")

    parser.add_argument('files',
                        help="The fasta, fastq or bam files",
                        nargs='+')
    parser.add_argument('-o',
                        '--out',
                        help="The (csv.gz) output file "
                        "containing the lengths and counts",
                        required=True)

    parser.add_argument(
        '-f',
        '--file-type',
        help="The type of the files. All "
        "files must be of the same type. If the \"AUTO\" file type is given, "
        "then the type will be guessed on the extension of the first file "
        "using the following heuristic: \"bam\" if the extension is\".bam\" "
        "or \".sam\"; "
        "\"fastq\" if the extension is \"fastq\", \"fastq.gz\", \"fq\", or "
        "\"fq.gz\"; \"fasta\" otherwise.",
        choices=file_type_choices,
        default=default_file_type)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use",
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.file_type == "AUTO":
        args.file_type = guess_file_type(args.files[0])
        msg = "The guessed file type is: {}".format(args.file_type)
        logger.info(msg)

    # grab the correct function pointer
    get_length_distribution = file_type_get_length_distribution[args.file_type]

    msg = "Collecting all read length distributions"
    logger.info(msg)

    all_length_distribution_dfs = parallel.apply_parallel_iter(
        args.files, args.num_cpus, get_length_distribution, progress_bar=True)

    msg = "Combining data frames into one large df"
    logger.info(msg)
    length_distribution_df = pd.concat(all_length_distribution_dfs)

    msg = "Writing counts to disk"
    logger.info(msg)

    pd_utils.write_df(length_distribution_df, args.out, index=False)
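
# A sketch of the guess_file_type heuristic described in the --file-type help
# text above; the actual helper is not shown here.
def guess_file_type_sketch(filename):
    if filename.endswith((".bam", ".sam")):
        return "bam"
    if filename.endswith((".fastq", ".fastq.gz", ".fq", ".fq.gz")):
        return "fastq"
    return "fasta"
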
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts a GTF file to a BED12 file. In particular, "
        "it creates bed entries based on the exon features and transcript_id field. "
        "It uses the CDS regions to determine the \"thick_start\" and \"thick_end\" "
        "features of the BED12 file.")

    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The (output) BED12 file")

    parser.add_argument(
        '--chr-name-file',
        help="If this file is given, then the "
        "bed entries will be sorted according to the order of seqnames in this "
        "file. Presumably, this is the chrName.txt file from STAR.",
        default=default_chr_name_file)

    parser.add_argument('--exon-feature',
                        help="The name of features which are "
                        "treated as exons",
                        default=default_exon_feature)
    parser.add_argument('--cds-feature',
                        help="The name of features which are "
                        "treated as CDSs",
                        default=default_cds_feature)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('-g',
                        '--num-groups',
                        help="The number of groups to split "
                        "into for parallelization",
                        type=int,
                        default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading GTF file"
    logger.info(msg)

    gtf = gtf_utils.read_gtf(args.gtf)

    msg = "Extracting exon and CDS features"
    logger.info(msg)

    m_exons = gtf['feature'] == args.exon_feature
    m_cds = gtf['feature'] == args.cds_feature

    exons = gtf[m_exons].copy()
    cds_df = gtf[m_cds].copy()

    msg = "Extracting CDS transcript ids"
    logger.info(msg)

    cds_transcript_ids = parallel.apply_parallel_split(
        cds_df,
        args.num_cpus,
        get_transcript_ids,
        progress_bar=True,
        num_groups=args.num_groups)
    cds_transcript_ids = collection_utils.flatten_lists(cds_transcript_ids)
    cds_df['transcript_id'] = cds_transcript_ids

    msg = "Calculating CDS genomic start and end positions"
    logger.info(msg)

    cds_groups = cds_df.groupby('transcript_id')

    # we subtract 1 from start because gtf is 1-based
    cds_min_starts = cds_groups['start'].min()
    cds_start_df = pd.DataFrame()
    cds_start_df['id'] = cds_min_starts.index
    cds_start_df['cds_start'] = cds_min_starts.values - 1

    # we do not subtract 1 from end because bed is "open" on the end
    cds_max_end = cds_groups['end'].max()
    cds_end_df = pd.DataFrame()
    cds_end_df['id'] = cds_max_end.index
    cds_end_df['cds_end'] = cds_max_end.values

    msg = "Extracting exon transcript ids"
    logger.info(msg)

    exon_transcript_ids = parallel.apply_parallel_split(
        exons,
        args.num_cpus,
        get_transcript_ids,
        progress_bar=True,
        num_groups=args.num_groups)
    exon_transcript_ids = collection_utils.flatten_lists(exon_transcript_ids)
    exons['transcript_id'] = exon_transcript_ids

    exons['length'] = exons['end'] - exons['start'] + 1
    exons['length'] = exons['length'].astype(str)

    # store these for sorting later
    transcript_ids = np.array(exons['transcript_id'])

    msg = "Combining exons into BED12 entries"
    logger.info(msg)

    exons = exons.sort_values('start')
    exon_groups = exons.groupby('transcript_id')

    bed12_df = parallel.apply_parallel_groups(exon_groups,
                                              args.num_cpus,
                                              get_bed12_entry,
                                              progress_bar=True)
    bed12_df = pd.DataFrame(bed12_df)

    msg = "Joining BED12 entries to CDS information"
    logger.info(msg)

    bed12_df = bed12_df.merge(cds_start_df, on='id', how='left')
    bed12_df = bed12_df.merge(cds_end_df, on='id', how='left')

    bed12_df = bed12_df.fillna(-1)

    bed12_df['thick_start'] = bed12_df['cds_start'].astype(int)
    bed12_df['thick_end'] = bed12_df['cds_end'].astype(int)

    msg = "Sorting BED12 entries"
    logger.info(msg)

    # We will break ties among transcripts by the order they appear
    # in the GTF file. This is the same way STAR breaks ties.
    bed12_df = bed_utils.sort(bed12_df,
                              seqname_order=args.chr_name_file,
                              transcript_ids=transcript_ids)

    msg = "Writing BED12 to disk"
    logger.info(msg)

    bed_utils.write_bed(bed12_df[bed_utils.bed12_field_names], args.out)
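
# get_bed12_entry is used above but not defined here; a sketch of how one
# BED12 entry could be assembled from a transcript's exon rows (already sorted
# by 'start'). The output field names are assumptions chosen to look like the
# bed_utils.bed12_field_names used above.
def get_bed12_entry_sketch(exon_group):
    starts = exon_group['start'].values - 1  # gtf is 1-based, bed is 0-based
    ends = exon_group['end'].values
    start = starts.min()

    block_sizes = ','.join(str(e - s) for s, e in zip(starts, ends))
    block_starts = ','.join(str(s - start) for s in starts)

    return {
        'seqname': exon_group['seqname'].iloc[0],
        'start': start,
        'end': ends.max(),
        'id': exon_group['transcript_id'].iloc[0],
        'score': 0,
        'strand': exon_group['strand'].iloc[0],
        'color': 0,
        'num_exons': len(exon_group),
        'exon_lengths': block_sizes,
        'exon_genomic_relative_starts': block_starts,
    }
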
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts the CCDS text files distributed by NCBI to "
        "valid BED12 files for use with other programs. Please see "
        "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/README, \"CCDS.[YearMonthDay].txt\" for "
        "more information.")
    parser.add_argument('ccds', help="The CCDS.txt file downloaded from NCBI")
    parser.add_argument('out', help="The output bed.gz file")

    parser.add_argument('-i',
                        '--ignore',
                        help="The ccds_status entries to ignore.",
                        default=default_ccds_status_to_ignore,
                        nargs='*')

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use "
                        "for extracting the exon information",
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading CCDS file"
    logger.info(msg)

    ccds_df = pd.read_csv(args.ccds, sep='\t')

    msg = "Copying simple values to BED"
    logger.info(msg)

    ccds_bed = pd.DataFrame()
    ccds_bed['seqname'] = ccds_df['#chromosome']
    ccds_bed['start'] = ccds_df['cds_from']
    ccds_bed['end'] = ccds_df['cds_to']
    ccds_bed['id'] = ccds_df['gene'] + ":" + ccds_df['ccds_id']
    ccds_bed['score'] = 0
    ccds_bed['strand'] = ccds_df['cds_strand']
    ccds_bed['thick_start'] = ccds_df['cds_from']
    ccds_bed['thick_end'] = ccds_df['cds_to']
    ccds_bed['color'] = 0

    msg = "Converting CCDS exons into BED12 blocks"
    logger.info(msg)

    cds_exon_info = parallel.apply_parallel(ccds_df,
                                            args.num_cpus,
                                            parse_cds_locations,
                                            args,
                                            progress_bar=True)

    cds_exon_info = [cei for cei in cds_exon_info if cei is not None]
    cds_exon_df = pd.DataFrame(cds_exon_info)

    msg = "Merging simple values and blocks"
    logger.info(msg)

    ccds_bed = ccds_bed.merge(cds_exon_df, on='id')

    # put the columns in the correct order
    ccds_bed = ccds_bed[bio.bed12_field_names]

    msg = "Writing the BED file"
    logger.info(msg)

    bed_utils.write_bed(ccds_bed, args.out)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script removes all of the entries from A which overlap "
        "any of the entries from B. Optionally, some minimum overlap can be "
        "given, in terms of overlap fraction.")

    parser.add_argument('bed_a',
                        help="The bed file from which entries will be "
                        "removed")
    parser.add_argument('bed_b',
                        help="The bed file used to find entries in A "
                        "to remove")

    parser.add_argument('out', help="The output (bed.gz) file")

    parser.add_argument(
        '--min-a-overlap',
        help="A minimum fraction required "
        "for overlap of the A entries. \"0\" means \"at least one bp.\"",
        type=float,
        default=default_min_a_overlap)

    parser.add_argument(
        '--min-b-overlap',
        help="A minimum fraction required "
        "for overlap of the B entries. \"0\" means \"at least one bp.\"",
        type=float,
        default=default_min_b_overlap)

    parser.add_argument(
        '--split',
        help="If this flag is given, then the bed "
        "entries in both files will be split. This can be somewhat slow, "
        "depending on the number of entries in the files.",
        action='store_true')

    parser.add_argument(
        '--exons',
        help="If the bed entries have already been "
        "split and the exon bed6+2 file (from split-bed12-blocks program) is "
        "available, then that can be given with this option. The exons from "
        "that file will be used for both A and B.",
        default=None)

    parser.add_argument('--exons-a',
                        help="As with the --exons argument, but "
                        "these exons will only be used for A",
                        default=None)

    parser.add_argument('--exons-b',
                        help="As with the --exons argument, but "
                        "these exons will only be used for B",
                        default=None)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use for "
                        "certain parts of the script",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--num-groups',
                        help="The number of groups to split the bed files "
                        "into for parallelization when --split is given",
                        type=int,
                        default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    exons_given = args.exons is not None
    exons_a_given = args.exons_a is not None
    exons_b_given = args.exons_b is not None

    # check if the exons files exist
    if exons_given and (not os.path.exists(args.exons)):
        msg = "The exons file does not exist: {}".format(args.exons)
        raise FileNotFoundError(msg)

    if exons_a_given and (not os.path.exists(args.exons_a)):
        msg = "The exons_a file does not exist: {}".format(args.exons_a)
        raise FileNotFoundError(msg)

    if exons_b_given and (not os.path.exists(args.exons_b)):
        msg = "The exons_b file does not exist: {}".format(args.exons_b)
        raise FileNotFoundError(msg)

    exons_a_only = exons_a_given and not exons_b_given
    exons_b_only = not exons_a_given and exons_b_given
    if exons_a_only or exons_b_only:
        msg = ("Only one of --exons-a, --exons-b was given. This is valid, "
               "but please ensure this is the desired behavior.")
        logger.warning(msg)

    # make sure we weren't given contradictory flags
    if args.split and exons_given:
        msg = "Both --split and --exons were given. Only one of these is allowed."
        raise ValueError(msg)

    if exons_given and (exons_a_given or exons_b_given):
        msg = (
            "Both --exons and (--exons-a or --exons-b) were given. --exons "
            "should not be given with the --exons-a and --exons-b arguments.")
        raise ValueError(msg)

    exons = None
    exons_a = None
    exons_b = None

    msg = "Reading bed A"
    logger.info(msg)
    bed_a = bed_utils.read_bed(args.bed_a)

    msg = "Reading bed B"
    logger.info(msg)
    bed_b = bed_utils.read_bed(args.bed_b)

    if args.split:
        msg = "Splitting bed A"
        logger.info(msg)

        exons_a = parallel.apply_parallel_split(bed_a,
                                                args.num_cpus,
                                                split_all_blocks,
                                                progress_bar=True,
                                                num_groups=args.num_groups)

        exons_a = pd.concat(exons_a)

        msg = "Splitting bed B"
        logger.info(msg)

        exons_b = parallel.apply_parallel_split(bed_b,
                                                args.num_cpus,
                                                split_all_blocks,
                                                progress_bar=True,
                                                num_groups=args.num_groups)

        exons_b = pd.concat(exons_b)

    if exons_given:
        msg = "Reading exons"
        logger.info(msg)
        exons = bed_utils.read_bed(args.exons)

    if exons_a_given:
        msg = "Reading A exons"
        logger.info(msg)
        exons_a = bed_utils.read_bed(args.exons_a)

    if exons_b_given:
        msg = "Reading B exons"
        logger.info(msg)
        exons_b = bed_utils.read_bed(args.exons_b)

    msg = "Finding all A entries which overlap B entries"
    logger.info(msg)

    remaining_a_ids = bed_utils.subtract_bed(bed_a,
                                             bed_b,
                                             min_a_overlap=args.min_a_overlap,
                                             min_b_overlap=args.min_b_overlap,
                                             exons=exons,
                                             exons_a=exons_a,
                                             exons_b=exons_b)

    msg = "Filtering the A entries which had overlaps"
    logger.info(msg)

    m_remaining = bed_a['id'].isin(remaining_a_ids)
    bed_a_remaining = bed_a[m_remaining]

    msg = "Writing remaining A entries to disk"
    logger.info(msg)

    bed_utils.write_bed(bed_a_remaining, args.out)
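
# The --min-a-overlap and --min-b-overlap fractions can be understood with
# this small, hypothetical check on a single pair of half-open intervals; the
# real bed_utils.subtract_bed works on whole data frames and handles the
# exon/splitting options above.
def passes_overlap_fractions(a_start, a_end, b_start, b_end,
                             min_a_overlap=0.0, min_b_overlap=0.0):
    overlap = min(a_end, b_end) - max(a_start, b_start)
    if overlap <= 0:
        return False  # no overlap at all

    a_fraction = overlap / (a_end - a_start)
    b_fraction = overlap / (b_end - b_start)
    return a_fraction >= min_a_overlap and b_fraction >= min_b_overlap
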
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script merges either the exons or CDS regions of all transcript "
        "isoforms into a single \"super gene isoform\". It does this based on the given "
        "GTF feature type and attribute (with defaults \"CDS\" and \"gene_id\", respectively)."
    )
    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The output (merged) GTF file")

    parser.add_argument('--feature-type',
                        help="The type of features to merge",
                        default=default_feature_type)
    parser.add_argument('--group-attribute',
                        help="The attribute by which the features "
                        "will be merged",
                        default=default_group_attribute)

    parser.add_argument('--id-format-str',
                        help="The python format string to "
                        "use for creating the \"transcript\" identifiers",
                        default=default_id_format_str)

    parser.add_argument(
        '--chr-name-file',
        help="If this file is specified, it will "
        "be used to determine the seqname sort order. This should be the "
        "\"chrName.txt\" file created by STAR. If not present, the transcripts "
        "will be sorted alphabetically (1, 10, 11, 2, ..., KL568162.1, MT, X, Y).",
        default=default_chr_name_file)

    parser.add_argument(
        '--add-exons',
        help="If this flag is given, then all features will "
        "be duplicated, but with the feature type \"exon\". Presumably, this should be given "
        "when \"CDS\" features are merged, and the resulting GTF file will be used by STAR "
        "(or anything else expecting \"exon\"s).",
        action='store_true')

    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into which to split "
        "the features. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.",
        type=int,
        default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Reading GTF file"
    logger.info(msg)

    gtf_df = gtf_utils.read_gtf(args.gtf)

    msg = "Extracting desired features"
    logger.info(msg)
    m_feature_type = gtf_df['feature'] == args.feature_type
    gtf_feature_df = gtf_df[m_feature_type]

    msg = "Parsing GTF attributes"
    logger.info(msg)

    attributes = parallel.apply_parallel_split(gtf_feature_df,
                                               args.num_cpus,
                                               parse_attributes_group,
                                               progress_bar=True,
                                               num_groups=args.num_groups)

    attributes_df = pd.concat(attributes)
    attributes_df['end'] = attributes_df['end'].astype(int)
    attributes_df['start'] = attributes_df['start'].astype(int)

    msg = "Merging isoforms"
    logger.info(msg)

    gene_features = attributes_df.groupby(args.group_attribute)
    merged_genes = parallel.apply_parallel_groups(gene_features,
                                                  args.num_cpus,
                                                  merge_gene_group,
                                                  args.group_attribute,
                                                  args.id_format_str,
                                                  progress_bar=True)

    merged_genes_df = pd.concat(merged_genes)

    if args.add_exons:
        merged_exons = merged_genes_df.copy()
        merged_exons['feature'] = 'exon'
        merged_genes_df = pd.concat([merged_exons, merged_genes_df])

    merged_genes_df['start'] = merged_genes_df['start'].astype(int)

    # now, sort the merged isoforms

    # this is a bit of a hack, because it is actually using the sorting routine
    # for bed data frames

    # we need a dummy 'id' column for sorting, so just use the attributes
    merged_genes_df['id'] = merged_genes_df['attributes']
    merged_genes_df = bed_utils.sort(merged_genes_df,
                                     seqname_order=args.chr_name_file)

    # last, drop duplicate rows
    fields = ['seqname', 'source', 'feature', 'start', 'end', 'strand']
    merged_genes_df = merged_genes_df.drop_duplicates(subset=fields)

    gtf_utils.write_gtf(merged_genes_df, args.out, compress=False)
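
# merge_gene_group is used above but not shown; at its core it is presumably a
# standard interval merge over one gene's features, sketched here. The output
# columns and the way id_format_str is applied are assumptions based on the
# arguments above.
import pandas as pd

def merge_gene_group_sketch(gene_df, group_attribute, id_format_str):
    gene_id = gene_df[group_attribute].iloc[0]
    intervals = sorted(zip(gene_df['start'], gene_df['end']))

    # merge overlapping or adjacent (1-based, inclusive) intervals
    merged = []
    for start, end in intervals:
        if merged and start <= merged[-1][1] + 1:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    rows = [{
        'seqname': gene_df['seqname'].iloc[0],
        'source': gene_df['source'].iloc[0],
        'feature': gene_df['feature'].iloc[0],
        'start': start,
        'end': end,
        'score': '.',
        'strand': gene_df['strand'].iloc[0],
        'frame': '.',
        'attributes': 'gene_id "{}"; transcript_id "{}";'.format(
            gene_id, id_format_str.format(gene_id)),
    } for start, end in merged]

    return pd.DataFrame(rows)
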
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script splits large chromosomes into several smaller ones. This "
        "is required to use BedTools for chromosomes with size larger than about 500M. In "
        "particular, this script splits chromosomal sequences into smaller chunks and "
        "updates GTF annotations to use the smaller chromosomes.\n\nFor more information, "
        "see https://groups.google.com/forum/#!topic/bedtools-discuss/t-nQSCxaFGE"
    )
    parser.add_argument('fasta', help="The chromosome sequence file")
    parser.add_argument('gtf', help="The annotation file")
    parser.add_argument(
        'out',
        help="The base output files. The script will create the "
        "files <out>.fa and <out>.gtf.")
    parser.add_argument('--max-size',
                        help="The largest allowed size (in bp) for a "
                        "chromosome",
                        type=int,
                        default=default_max_size)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Splitting fasta sequences"
    logger.info(msg)

    fasta = fastx_utils.get_read_iterator(args.fasta, is_fasta=True)
    split_fasta = {}

    for name, seq in fasta:
        name_split = name.split(" ")
        split_seqs = list(
            iter(partial(StringIO(seq).read, args.max_size), ''))

        for i, split_seq in enumerate(split_seqs):
            n = name_split[0]  # a bit of a hack to save the base sequence name
            name_split[0] = "{}_{}".format(name_split[0], i)
            split_name = ' '.join(name_split)

            split_fasta[split_name] = split_seq

            name_split[0] = n

    msg = "Writing fasta output file"
    logger.info(msg)

    fasta_out = "{}.fa".format(args.out)
    fastx_utils.write_fasta(split_fasta,
                            fasta_out,
                            compress=False,
                            progress_bar=True)

    msg = "Reading GTF"
    logger.info(msg)

    gtf = gtf_utils.read_gtf(args.gtf)

    msg = "Updating GTF coordinates"
    logger.info(msg)

    # get the split for each feature
    start_split_num = gtf['start'] // args.max_size
    end_split_num = gtf['end'] // args.max_size

    # wrap the coordinates based on the max size
    gtf['start'] = np.mod(gtf['start'], args.max_size)
    gtf['end'] = np.mod(gtf['end'], args.max_size)

    # for the names, we need them as strings without ".0" at the end
    start_split_num = start_split_num.astype(int)
    start_split_num = start_split_num.astype(str)

    end_split_num = end_split_num.astype(int)
    end_split_num = end_split_num.astype(str)

    # create the start and end names
    split_start_seqname = gtf['seqname'] + "_" + start_split_num
    split_end_seqname = gtf['seqname'] + "_" + end_split_num

    gtf['start_seqname'] = split_start_seqname
    gtf['end_seqname'] = split_end_seqname

    # remove features which span the gaps
    m_span = gtf['start_seqname'] != gtf['end_seqname']
    gtf = gtf[~m_span]

    num_spanning = sum(m_span)
    msg = (
        "Number of features spanning length boundaries: {}.\n\nThese features "
        "will be discarded.".format(num_spanning))
    logger.warning(msg)

    # and update the seqnames
    gtf['seqname'] = gtf['start_seqname']

    msg = "Writing GTF output file"
    logger.info(msg)

    gtf_out = "{}.gtf".format(args.out)
    gtf_utils.write_gtf(gtf, gtf_out, compress=False)
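
# The iter(partial(StringIO(seq).read, n), '') idiom used above simply yields
# fixed-size chunks of a string until the empty-string sentinel is returned;
# a small standalone illustration:
from functools import partial
from io import StringIO

seq = "ACGTACGTAC"
chunks = list(iter(partial(StringIO(seq).read, 4), ''))
print(chunks)  # ['ACGT', 'ACGT', 'AC']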