def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all duplicate sequences from a list of fasta "
        "files and writes the remaining sequences back out as a fasta file."
        "\n\n"
        "If desired, a regular expression can be given for \"lower precedence\" "
        "sequence identifiers. An example of using this precedence operator "
        "is in removing duplicate sequences from a fasta file which combines "
        "de novo assembled transcripts and annotated ones. In case a de novo "
        "transcript matches an annotated one, we would prefer to keep only "
        "the annotated transcript and identifier. Thus, we would pass an RE "
        "matching the de novo assembled identifiers (which have a lower precedence)."
        "\n\n"
        "If a precedence RE is not given, or two identifiers have the same "
        "precedence, the first identifier encountered will be kept.")

    parser.add_argument('fasta', help="The input fasta file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output (fasta[.gz]) file",
        required=True)
    parser.add_argument('--compress', help="If this flag is given, then the output "
        "will be gzipped.", action='store_true')
    parser.add_argument('-l', '--lower-precedence-re', help="A regular expression "
        "that matches the identifiers of lower-precedence transcripts. (See the "
        "description for more details.)", default=default_lower_precedence_re)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    files = '\n'.join(args.fasta)
    msg = "Found the following files from the command line: {}".format(files)
    logger.info(msg)

    fastx_utils.remove_duplicate_sequences(
        args.fasta,
        args.out,
        compress=args.compress,
        lower_precedence_re=args.lower_precedence_re,
        progress_bar=True
    )
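# A minimal sketch (not the fastx_utils implementation) of the precedence rule
# described above: when two identifiers share the same sequence, prefer the one
# that does NOT match the lower-precedence RE; otherwise keep the first one seen.
# The RE and identifiers below are hypothetical.
import re

def pick_identifier(existing_id, new_id, lower_precedence_re=r"^TRINITY"):
    """Return the identifier to keep for a duplicated sequence."""
    existing_is_lower = re.search(lower_precedence_re, existing_id) is not None
    new_is_lower = re.search(lower_precedence_re, new_id) is not None

    # only replace the stored identifier when it is lower precedence
    # and the new one is not
    if existing_is_lower and not new_is_lower:
        return new_id
    return existing_id

# example: a de novo id loses to an annotated id
assert pick_identifier("TRINITY_DN1_c0_g1", "ENST00000335137") == "ENST00000335137"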
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script filters bam A by the read ids in bam B. In "
        "particular, only the reads in A with ids *which appear* in B are kept.")

    parser.add_argument('bam_a', help="The bam file to filter")
    parser.add_argument('bam_b', help="The bam file whose ids will be kept in A")
    parser.add_argument('bam_out', help="The output (bam) file")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading identifiers from B"
    logger.info(msg)
    ids_to_keep = bam_utils.get_read_identifiers(args.bam_b)

    msg = "Filtering reads from A which do not appear in B"
    logger.info(msg)

    with ExitStack() as stack:
        bam_a = stack.enter_context(
            bam_utils.get_pysam_alignment_file(args.bam_a))
        bam_out = stack.enter_context(
            bam_utils.get_pysam_alignment_file(args.bam_out, "wb", template=bam_a))

        for read in bam_a.fetch():
            if read.query_name in ids_to_keep:
                bam_out.write(read)
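# A self-contained sketch of the same filtering pattern using pysam directly,
# for when the bam_utils wrappers are not available. Iterating the files rather
# than calling fetch() avoids requiring an index; the paths are placeholders.
import pysam

def filter_bam_by_ids(bam_a_path, bam_b_path, out_path):
    # collect the read names present in B
    with pysam.AlignmentFile(bam_b_path, "rb") as bam_b:
        ids_to_keep = {read.query_name for read in bam_b}

    # keep only the reads in A whose names appear in B
    with pysam.AlignmentFile(bam_a_path, "rb") as bam_a, \
            pysam.AlignmentFile(out_path, "wb", template=bam_a) as bam_out:
        for read in bam_a:
            if read.query_name in ids_to_keep:
                bam_out.write(read)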
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all reads with multiple alignments from a "
        "bam file. It then sorts and indexes the reads.")

    parser.add_argument('align_in', help="The input alignment file")
    parser.add_argument('align_out', help="The output alignment file with "
        "multimappers removed")
    parser.add_argument('--tmp', help="The path where temporary files for samtools "
        "sort will be stored. If not given, then the samtools default tmp choice "
        "will be used.", default=default_tmp)
    parser.add_argument('--do-not-call', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    call = not args.do_not_call
    bam_utils.remove_multimapping_reads(
        args.align_in, args.align_out, call=call, tmp=args.tmp)
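# A rough sketch of one common way to drop multimapping reads with pysam:
# keep only reads whose "NH" (number of hits) tag equals 1. This mirrors the
# intent of bam_utils.remove_multimapping_reads but is not its implementation,
# and it does not perform the subsequent sort and index steps.
import pysam

def keep_unique_alignments(bam_in_path, bam_out_path):
    with pysam.AlignmentFile(bam_in_path, "rb") as bam_in, \
            pysam.AlignmentFile(bam_out_path, "wb", template=bam_in) as bam_out:
        for read in bam_in:
            # reads without an NH tag are skipped conservatively
            if read.has_tag("NH") and read.get_tag("NH") == 1:
                bam_out.write(read)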
def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('config', help="The path to the yaml configuration file.")
    parser.add_argument('--chunk-size', type=int, default=100,
        help="The size of chunks for parallelization")
    parser.add_argument('--num-notes', type=int, default=None,
        help="The number of notes to read in. This is mostly for debugging purposes.")

    dask_utils.add_dask_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    return args
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script replaces all score and color values in bed files "
        "with '0'. It applies this to all bed files in the current directory.")

    parser.add_argument('--no-ask', help="By default, the program will ask to "
        "replace the values for each bed file. If this flag is given, then the "
        "asking will be skipped.", action='store_true')
    parser.add_argument('--bed-extensions', help="The extensions to treat as "
        "bed files", nargs='+', default=default_bed_extensions)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    ask = not args.no_ask

    for bed_extension in args.bed_extensions:
        glob_pattern = "*{}".format(bed_extension)
        bed_files = glob.glob(glob_pattern)

        for bed_file in bed_files:
            print("fix: {}".format(bed_file))
            if (not ask) or fix_bed(bed_file):
                bed = bed_utils.read_bed(bed_file)
                bed['score'] = 0
                bed['color'] = 0
                bed_utils.write_bed(bed, bed_file)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of reads in the given fastq "
        "(possibly gzipped) files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames",
        nargs='+')
    parser.add_argument('-o', '--out', help="The (csv.gz) output file",
        required=True)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    read_counts = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        read_count = fastx_utils.get_read_count(f, is_fasta=False)
        datasets.append(pyllars.utils.get_basename(f))
        read_counts.append(read_count)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['reads'] = read_counts

    msg = "Writing data frame to disk"
    logger.info(msg)
    pd_utils.write_df(df, args.out, index=False)
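# A minimal sketch of counting reads in a (possibly gzipped) fastq file without
# the fastx_utils helper: each record occupies exactly four lines, so the read
# count is the line count divided by four. Assumes well-formed, unwrapped fastq.
import gzip

def count_fastq_reads(path):
    open_fn = gzip.open if path.endswith(".gz") else open
    with open_fn(path, "rt") as fh:
        num_lines = sum(1 for _ in fh)
    return num_lines // 4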
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits BED12+ files into a BED6+ file. Each block "
        "(i.e., exon) in the original file is an individual feature in the new file. "
        "There are two extra fields, exon_index and transcript_start, which give the "
        "index of the exon within its transcript and the start of the exon in the "
        "\"spliced\" version of the transcript. The \"id\" column in the original "
        "file is used as the \"id\" in the new file, so the exons can easily be "
        "grouped.")

    parser.add_argument('bed', help="The BED12+ file")
    parser.add_argument('out', help="The output BED6+2 file")
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    parser.add_argument('--num-groups', help="The number of groups to split the "
        "bed file into for parallelization", type=int, default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading BED12+ file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)

    msg = "Splitting blocks"
    logger.info(msg)
    exons = parallel.apply_parallel_split(
        bed,
        args.num_cpus,
        split_all_blocks,
        progress_bar=True,
        num_groups=args.num_groups
    )

    msg = "Merging exons into a data frame"
    logger.info(msg)
    exons = pd.concat(exons)

    fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
    exons = exons[fields]

    msg = "Writing BED6+2 file"
    logger.info(msg)
    bed_utils.write_bed(exons, args.out)
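# A small, self-contained illustration of expanding one BED12 record into
# per-exon BED6+2 rows, in the spirit of split_all_blocks (which may differ in
# detail). The record is made up, and for simplicity this sketch ignores strand
# when computing transcript_start (on the minus strand the spliced coordinates
# run in the opposite direction).
def expand_bed12_blocks(record):
    block_sizes = [int(s) for s in str(record['block_sizes']).rstrip(',').split(',')]
    block_starts = [int(s) for s in str(record['block_starts']).rstrip(',').split(',')]

    exons = []
    transcript_start = 0
    for exon_index, (rel_start, size) in enumerate(zip(block_starts, block_sizes)):
        exons.append({
            'seqname': record['seqname'],
            'start': record['start'] + rel_start,
            'end': record['start'] + rel_start + size,
            'id': record['id'],
            'score': record['score'],
            'strand': record['strand'],
            'exon_index': exon_index,
            'transcript_start': transcript_start,
        })
        transcript_start += size
    return exons

example = {'seqname': 'chr1', 'start': 100, 'end': 250, 'id': 'tx1', 'score': 0,
           'strand': '+', 'block_sizes': '50,50', 'block_starts': '0,100'}
# yields exons at [100, 150) and [200, 250)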
def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('config', help="The path to the yaml configuration file.")

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    return args
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Remove duplicate entries from a list of bed12+ files. "
        "Write the non-redundant entries to a new file. Precedence among "
        "duplicates is arbitrary.")

    parser.add_argument('bed', help="The input bed file(s)", nargs='+')
    parser.add_argument('-o', '--out', help="The output bed(.gz) file",
        required=True)
    parser.add_argument('--compress', help="If this flag is given, the output "
        "will be gzipped. The output filename *will not* be changed (so it "
        "should already end in \".gz\").", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed files"
    logger.info(msg)
    all_bed = [bed_utils.read_bed(b) for b in args.bed]

    for f, b in zip(args.bed, all_bed):
        msg = "{}. number of entries: {}".format(f, len(b))
        logger.debug(msg)

    msg = "Concatenating bed entries"
    logger.info(msg)
    all_bed_df = pd.concat(all_bed)

    msg = "Removing duplicate entries"
    logger.info(msg)
    all_bed_df = all_bed_df.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "number of non-redundant entries: {}".format(len(all_bed_df))
    logger.debug(msg)

    msg = "Sorting non-redundant entries"
    logger.info(msg)
    sort_fields = ['seqname', 'start', 'end', 'strand']
    all_bed_df = all_bed_df.sort_values(by=sort_fields)

    msg = "Writing sorted, non-redundant entries to disk"
    logger.info(msg)
    bed_utils.write_bed(all_bed_df, args.out, compress=args.compress)
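# A tiny illustration of the deduplication step: pandas keeps the first
# occurrence by default, which is why precedence among duplicates is arbitrary
# (it depends only on file and row order). DUPLICATE_FIELDS is defined elsewhere
# in the script; the value below is a plausible stand-in.
import pandas as pd

DUPLICATE_FIELDS = ['seqname', 'start', 'end', 'strand']

df = pd.DataFrame([
    {'seqname': 'chr1', 'start': 10, 'end': 50, 'strand': '+', 'id': 'a'},
    {'seqname': 'chr1', 'start': 10, 'end': 50, 'strand': '+', 'id': 'b'},
])
deduped = df.drop_duplicates(subset=DUPLICATE_FIELDS)  # keeps the row with id 'a'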
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of uniquely- and multi-mapping "
        "reads in a list of bam files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames",
        nargs='+')
    parser.add_argument('-o', '--out', help="The (csv.gz) output file",
        required=True)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    aligned_reads = []
    uniquely_aligned_reads = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        num_aligned_reads = bam_utils.count_aligned_reads(f)
        num_uniquely_aligned_reads = bam_utils.count_uniquely_mapping_reads(f)

        datasets.append(pyllars.utils.get_basename(f))
        aligned_reads.append(num_aligned_reads)
        uniquely_aligned_reads.append(num_uniquely_aligned_reads)

    msg = "Constructing data frame"
    logger.info(msg)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['aligned_reads'] = aligned_reads
    df['uniquely_aligned_reads'] = uniquely_aligned_reads
    df['multimapping_reads'] = df['aligned_reads'] - df['uniquely_aligned_reads']

    msg = "Writing data frame to disk"
    logger.info(msg)
    pd_utils.write_df(df, args.out, index=False)
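# A sketch of counting aligned and uniquely aligned reads directly with pysam,
# approximating what bam_utils.count_aligned_reads and count_uniquely_mapping_reads
# do. Here "unique" means the NH tag equals 1; secondary and supplementary
# alignments are skipped so multimappers are not double-counted.
import pysam

def count_reads(bam_path):
    aligned = 0
    unique = 0
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        for read in bam:
            if read.is_unmapped or read.is_secondary or read.is_supplementary:
                continue
            aligned += 1
            if read.has_tag("NH") and read.get_tag("NH") == 1:
                unique += 1
    return aligned, unique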
def parse_arguments() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    parser.add_argument('config', help="The path to the yaml configuration file.")
    parser.add_argument('--seed', type=int, default=8675309,
        help="The seed for the random number generator")
    parser.add_argument('--chunk-size', type=int, default=100,
        help="The size of chunks for parallelization")

    dask_utils.add_dask_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    return args
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes duplicate reads from paired-end fastq "
        "files. It only treats exact matches as duplicates (i.e., if reads are of "
        "different lengths, it does not consider exact substring matches as "
        "duplicates). The duplicate with the highest average quality score is "
        "retained.\n\nThis script is not designed to work with fasta files.")

    parser.add_argument('fastq_1', help="The first mate file")
    parser.add_argument('fastq_2', help="The second mate file")
    parser.add_argument('out_1', help="The de-duped first mate file")
    parser.add_argument('out_2', help="The de-duped second mate file")
    parser.add_argument('--do-not-compress', help="Unless this flag is given, the "
        "output will be gzipped. N.B. \".gz\" *will not* be added to the file "
        "names.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    compress = not args.do_not_compress

    msg = "Counting reads in each file"
    logger.info(msg)

    # we will use the counts for displaying a progress bar
    num_reads_1 = fastx_utils.get_read_count(args.fastq_1, is_fasta=False)
    num_reads_2 = fastx_utils.get_read_count(args.fastq_2, is_fasta=False)

    # but avoid errors by making sure the counts match
    if num_reads_1 != num_reads_2:
        msg = "The number of reads in the files do not match ({} vs. {})".format(
            num_reads_1, num_reads_2)
        raise ValueError(msg)

    msg = "Creating read iterators"
    logger.info(msg)

    fastq_1_iter = fastx_utils.get_read_iterator(args.fastq_1, is_fasta=False)
    fastq_2_iter = fastx_utils.get_read_iterator(args.fastq_2, is_fasta=False)
    fastq_iter = zip(fastq_1_iter, fastq_2_iter)

    msg = "Detecting duplicates"
    logger.info(msg)

    seen_reads = {}

    for r1, r2 in tqdm.tqdm(fastq_iter, total=num_reads_1):
        r_key = (r1[1], r2[1])
        prev_val = seen_reads.get(r_key, None)

        if prev_val is None:
            srp = stored_read_pair(r1_name=r1[0], r2_name=r2[0],
                                   r1_qual=r1[2], r2_qual=r2[2])
            seen_reads[r_key] = srp
        else:
            # compare the total quality of the new pair to the stored pair
            new_qual_score = sum(r1[2].encode()) + sum(r2[2].encode())
            prev_qual_score = (sum(prev_val.r1_qual.encode())
                               + sum(prev_val.r2_qual.encode()))

            if new_qual_score > prev_qual_score:
                srp = stored_read_pair(r1_name=r1[0], r2_name=r2[0],
                                       r1_qual=r1[2], r2_qual=r2[2])
                seen_reads[r_key] = srp

    msg = "Writing the de-duped files to disk"
    logger.info(msg)

    with ExitStack() as stack:
        # enter the output handles into the stack so they are closed properly
        out_1 = stack.enter_context(
            pyllars.utils.open_file(args.out_1, 'w', compress=compress))
        out_2 = stack.enter_context(
            pyllars.utils.open_file(args.out_2, 'w', compress=compress))

        for (seqs, srp) in tqdm.tqdm(seen_reads.items()):
            fastx_utils._write_fastq_entry(out_1, srp.r1_name, seqs[0], srp.r1_qual)
            fastx_utils._write_fastq_entry(out_2, srp.r2_name, seqs[1], srp.r2_qual)
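# A short illustration of the quality comparison used above: summing the encoded
# (Phred+33) quality bytes is a monotone proxy for the average quality when the
# reads have the same length, which holds here because only exact sequence
# matches are treated as duplicates. The quality strings are made up.
q_first = "IIIIHHHH"   # higher-quality copy
q_second = "!!!!####"  # lower-quality copy

assert sum(q_first.encode()) > sum(q_second.encode())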
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a report for a list of Ensembl or Entrez "
        "gene identifiers. In particular, it extracts information from Swiss-Prot, "
        "TrEMBL, Interpro, PDB, Pfam, PROSITE, the Gene Ontology, and KEGG. It uses "
        "the mygene.info service to collect the annotations.")

    parser.add_argument('filename', help="The name of the file")
    parser.add_argument('out', help="The output file. It will contain the same "
        "information as the input file with the gene annotations appended as new "
        "columns. The format is the same as the input format.")
    parser.add_argument('-f', '--filetype', help="The format of the input file. By "
        "default, the format will be guessed based on the file extension.",
        choices=filetype_choices, default=default_filetype)
    parser.add_argument('--sep', help="The separator in the file (if csv)",
        default=default_sep)
    parser.add_argument('-s', '--sheet', help="The name of the sheet (for excel "
        "files) or key (for hdf5 files) from which the gene list is extracted. By "
        "default, the first sheet in an excel file is used. This argument is not "
        "used for csv files.", default=default_sheet)
    parser.add_argument('-c', '--column', help="The name of the column (or key in "
        "hdf5) from which the gene list is extracted. By default, the first column "
        "in the extracted data frame is used.", default=default_column)
    parser.add_argument('--do-not-compress', help="If this flag is present and the "
        "file is in csv format, then the output will not be compressed. By default, "
        "the output is compressed.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading the file"
    logger.info(msg)
    df = pd_utils.read_df(args.filename, filetype=args.filetype,
                          sheet=args.sheet, sep=args.sep)

    msg = "Extracting gene identifiers"
    logger.info(msg)

    if args.column is None:
        args.column = df.columns[0]
    gene_ids = df[args.column]

    msg = "Pulling information from mygene.info"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids)

    msg = "Joining results to original input"
    logger.info(msg)
    res_df = df.merge(res_df, left_on=args.column, right_on='gene_id', how='inner')

    msg = "Writing output"
    logger.info(msg)
    pd_utils.write_df(res_df, args.out, filetype=args.filetype, sheet=args.sheet,
                      do_not_compress=args.do_not_compress, index=False)

    msg = "Finished"
    logger.info(msg)
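# A hedged sketch of what a mygene.info query can look like using the official
# "mygene" python client. The script itself goes through mygene_utils.query_mygene,
# whose exact fields and behavior may differ; the identifiers and field list below
# are illustrative only.
import mygene

mg = mygene.MyGeneInfo()
gene_ids = ["ENSG00000141510", "7157"]  # example Ensembl and Entrez ids
results = mg.querymany(
    gene_ids,
    scopes="ensembl.gene,entrezgene",
    fields="symbol,name,go,pathway.kegg,interpro,pdb,pfam",
    as_dataframe=True,
)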
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of unique reads in the given "
        "files, all of which must be the same type. In the case of bam files, it "
        "only counts primary alignments (so it does not double-count multimappers, "
        "and it does not include unmapped reads present in the file).")

    parser.add_argument('files', help="The fasta, fastq or bam files", nargs='+')
    parser.add_argument('-o', '--out', help="The (csv.gz) output file containing "
        "the lengths and counts", required=True)
    parser.add_argument('-f', '--file-type', help="The type of the files. All "
        "files must be of the same type. If the \"AUTO\" file type is given, "
        "then the type will be guessed based on the extension of the first file "
        "using the following heuristic: \"bam\" if the extension is \".bam\" "
        "or \".sam\"; \"fastq\" if the extension is \"fastq\", \"fastq.gz\", "
        "\"fq\", or \"fq.gz\"; \"fasta\" otherwise.",
        choices=file_type_choices, default=default_file_type)
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.file_type == "AUTO":
        args.file_type = guess_file_type(args.files[0])
        msg = "The guessed file type is: {}".format(args.file_type)
        logger.info(msg)

    # grab the correct function pointer
    get_length_distribution = file_type_get_length_distribution[args.file_type]

    msg = "Collecting all read length distributions"
    logger.info(msg)
    all_length_distribution_dfs = parallel.apply_parallel_iter(
        args.files, args.num_cpus, get_length_distribution, progress_bar=True)

    msg = "Combining data frames into one large df"
    logger.info(msg)
    length_distribution_df = pd.concat(all_length_distribution_dfs)

    msg = "Writing counts to disk"
    logger.info(msg)
    pd_utils.write_df(length_distribution_df, args.out, index=False)
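# A minimal sketch of building a read length distribution for a single fastq file
# with a Counter, roughly what one of the file_type_get_length_distribution
# functions produces (the actual column names in the script may differ).
import collections
import gzip

import pandas as pd

def get_fastq_length_distribution(path):
    open_fn = gzip.open if path.endswith(".gz") else open
    counts = collections.Counter()
    with open_fn(path, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 4 == 1:  # the sequence line of each record
                counts[len(line.strip())] += 1

    df = pd.DataFrame(sorted(counts.items()), columns=['length', 'count'])
    df['file'] = path
    return df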
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script converts a GTF file to a BED12 file. In "
        "particular, it creates bed entries based on the exon features and "
        "transcript_id field. It uses the CDS regions to determine the "
        "\"thick_start\" and \"thick_end\" fields of the BED12 file.")

    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The (output) BED12 file")
    parser.add_argument('--chr-name-file', help="If this file is given, then the "
        "bed entries will be sorted according to the order of seqnames in this "
        "file. Presumably, this is the chrName.txt file from STAR.",
        default=default_chr_name_file)
    parser.add_argument('--exon-feature', help="The name of features which are "
        "treated as exons", default=default_exon_feature)
    parser.add_argument('--cds-feature', help="The name of features which are "
        "treated as CDSs", default=default_cds_feature)
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)
    parser.add_argument('-g', '--num-groups', help="The number of groups to split "
        "into for parallelization", type=int, default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading GTF file"
    logger.info(msg)
    gtf = gtf_utils.read_gtf(args.gtf)

    msg = "Extracting exon and CDS features"
    logger.info(msg)
    m_exons = gtf['feature'] == args.exon_feature
    m_cds = gtf['feature'] == args.cds_feature

    exons = gtf[m_exons].copy()
    cds_df = gtf[m_cds].copy()

    msg = "Extracting CDS transcript ids"
    logger.info(msg)
    cds_transcript_ids = parallel.apply_parallel_split(
        cds_df,
        args.num_cpus,
        get_transcript_ids,
        progress_bar=True,
        num_groups=args.num_groups)
    cds_transcript_ids = collection_utils.flatten_lists(cds_transcript_ids)
    cds_df['transcript_id'] = cds_transcript_ids

    msg = "Calculating CDS genomic start and end positions"
    logger.info(msg)
    cds_groups = cds_df.groupby('transcript_id')

    # we subtract 1 from start because gtf is 1-based
    cds_min_starts = cds_groups['start'].min()
    cds_start_df = pd.DataFrame()
    cds_start_df['id'] = cds_min_starts.index
    cds_start_df['cds_start'] = cds_min_starts.values - 1

    # we do not subtract 1 from end because bed is "open" on the end
    cds_max_end = cds_groups['end'].max()
    cds_end_df = pd.DataFrame()
    cds_end_df['id'] = cds_max_end.index
    cds_end_df['cds_end'] = cds_max_end.values

    msg = "Extracting exon transcript ids"
    logger.info(msg)
    exon_transcript_ids = parallel.apply_parallel_split(
        exons,
        args.num_cpus,
        get_transcript_ids,
        progress_bar=True,
        num_groups=args.num_groups)
    exon_transcript_ids = collection_utils.flatten_lists(exon_transcript_ids)
    exons['transcript_id'] = exon_transcript_ids

    exons['length'] = exons['end'] - exons['start'] + 1
    exons['length'] = exons['length'].astype(str)

    # store these for sorting later
    transcript_ids = np.array(exons['transcript_id'])

    msg = "Combining exons into BED12 entries"
    logger.info(msg)
    exons = exons.sort_values('start')
    exon_groups = exons.groupby('transcript_id')
    bed12_df = parallel.apply_parallel_groups(
        exon_groups,
        args.num_cpus,
        get_bed12_entry,
        progress_bar=True)
    bed12_df = pd.DataFrame(bed12_df)

    msg = "Joining BED12 entries to CDS information"
    logger.info(msg)
    bed12_df = bed12_df.merge(cds_start_df, on='id', how='left')
    bed12_df = bed12_df.merge(cds_end_df, on='id', how='left')
    bed12_df = bed12_df.fillna(-1)

    bed12_df['thick_start'] = bed12_df['cds_start'].astype(int)
    bed12_df['thick_end'] = bed12_df['cds_end'].astype(int)

    msg = "Sorting BED12 entries"
    logger.info(msg)

    # We will break ties among transcripts by the order they appear
    # in the GTF file. This is the same way STAR breaks ties.
    bed12_df = bed_utils.sort(bed12_df, seqname_order=args.chr_name_file,
                              transcript_ids=transcript_ids)

    msg = "Writing BED12 to disk"
    logger.info(msg)
    bed_utils.write_bed(bed12_df[bed_utils.bed12_field_names], args.out)
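# A worked illustration of the coordinate-conversion comments above: GTF is
# 1-based and closed on both ends, while BED is 0-based and half-open on the end.
# A CDS annotated as 1..9 in GTF therefore becomes thick_start=0, thick_end=9 in
# BED, and both conventions describe the same 9 bases.
gtf_cds_start, gtf_cds_end = 1, 9

bed_thick_start = gtf_cds_start - 1   # subtract 1 from the start
bed_thick_end = gtf_cds_end           # the end is left as-is

assert bed_thick_end - bed_thick_start == gtf_cds_end - gtf_cds_start + 1 == 9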
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script converts the CCDS text files distributed by NCBI "
        "to valid BED12 files for use with other programs. Please see "
        "ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/README, \"CCDS.[YearMonthDay].txt\" "
        "for more information.")

    parser.add_argument('ccds', help="The CCDS.txt file downloaded from NCBI")
    parser.add_argument('out', help="The output bed.gz file")
    parser.add_argument('-i', '--ignore', help="The ccds_status entries to ignore.",
        default=default_ccds_status_to_ignore, nargs='*')
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use "
        "for extracting the exon information", type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading CCDS file"
    logger.info(msg)
    ccds_df = pd.read_csv(args.ccds, sep='\t')

    msg = "Copying simple values to BED"
    logger.info(msg)

    ccds_bed = pd.DataFrame()
    ccds_bed['seqname'] = ccds_df['#chromosome']
    ccds_bed['start'] = ccds_df['cds_from']
    ccds_bed['end'] = ccds_df['cds_to']
    ccds_bed['id'] = ccds_df['gene'] + ":" + ccds_df['ccds_id']
    ccds_bed['score'] = 0
    ccds_bed['strand'] = ccds_df['cds_strand']
    ccds_bed['thick_start'] = ccds_df['cds_from']
    ccds_bed['thick_end'] = ccds_df['cds_to']
    ccds_bed['color'] = 0

    msg = "Converting CCDS exons into BED12 blocks"
    logger.info(msg)
    cds_exon_info = parallel.apply_parallel(ccds_df, args.num_cpus,
        parse_cds_locations, args, progress_bar=True)
    cds_exon_info = [cei for cei in cds_exon_info if cei is not None]
    cds_exon_df = pd.DataFrame(cds_exon_info)

    msg = "Merging simple values and blocks"
    logger.info(msg)
    ccds_bed = ccds_bed.merge(cds_exon_df, on='id')

    # put the columns in the correct order
    ccds_bed = ccds_bed[bio.bed12_field_names]

    msg = "Writing the BED file"
    logger.info(msg)
    bed_utils.write_bed(ccds_bed, args.out)
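# A self-contained sketch of turning a list of genomic exon intervals (such as
# the parsed cds_locations from a CCDS row) into the BED12 block fields. The
# intervals here are invented and assumed to be 0-based, half-open, and sorted;
# parse_cds_locations in the script handles the actual CCDS field format.
def intervals_to_bed12_blocks(chrom_start, intervals):
    block_sizes = [end - start for start, end in intervals]
    block_starts = [start - chrom_start for start, _ in intervals]
    return {
        'block_count': len(intervals),
        'block_sizes': ','.join(str(s) for s in block_sizes),
        'block_starts': ','.join(str(s) for s in block_starts),
    }

# two exons: [100, 150) and [200, 260)
blocks = intervals_to_bed12_blocks(100, [(100, 150), (200, 260)])
# -> {'block_count': 2, 'block_sizes': '50,60', 'block_starts': '0,100'}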
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script removes all of the entries from A which overlap "
        "any of the entries from B. Optionally, some minimum overlap can be "
        "given, in terms of overlap fraction.")

    parser.add_argument('bed_a', help="The bed file from which entries will be "
        "removed")
    parser.add_argument('bed_b', help="The bed file used to find entries in A "
        "to remove")
    parser.add_argument('out', help="The output (bed.gz) file")
    parser.add_argument('--min-a-overlap', help="A minimum fraction required "
        "for overlap of the A entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_a_overlap)
    parser.add_argument('--min-b-overlap', help="A minimum fraction required "
        "for overlap of the B entries. \"0\" means \"at least one bp.\"",
        type=float, default=default_min_b_overlap)
    parser.add_argument('--split', help="If this flag is given, then the bed "
        "entries in both files will be split. This can be somewhat slow, "
        "depending on the number of entries in the files.", action='store_true')
    parser.add_argument('--exons', help="If the bed entries have already been "
        "split and the exon bed6+2 file (from split-bed12-blocks program) is "
        "available, then that can be given with this option. The exons from "
        "that file will be used for both A and B.", default=None)
    parser.add_argument('--exons-a', help="As with the --exons argument, but "
        "these exons will only be used for A", default=None)
    parser.add_argument('--exons-b', help="As with the --exons argument, but "
        "these exons will only be used for B", default=None)
    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for "
        "certain parts of the script", type=int, default=default_num_cpus)
    # num-groups is used below when --split is given
    parser.add_argument('-g', '--num-groups', help="The number of groups to split "
        "the bed files into for parallelization", type=int,
        default=default_num_groups)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    exons_given = args.exons is not None
    exons_a_given = args.exons_a is not None
    exons_b_given = args.exons_b is not None

    # check if the exons files exist
    if exons_given and (not os.path.exists(args.exons)):
        msg = "The exons file does not exist: {}".format(args.exons)
        raise FileNotFoundError(msg)

    if exons_a_given and (not os.path.exists(args.exons_a)):
        msg = "The exons_a file does not exist: {}".format(args.exons_a)
        raise FileNotFoundError(msg)

    if exons_b_given and (not os.path.exists(args.exons_b)):
        msg = "The exons_b file does not exist: {}".format(args.exons_b)
        raise FileNotFoundError(msg)

    exons_a_only = exons_a_given and not exons_b_given
    exons_b_only = not exons_a_given and exons_b_given

    if exons_a_only or exons_b_only:
        msg = ("Only one of --exons-a, --exons-b was given. This is valid, "
               "but please ensure this is the desired behavior.")
        logger.warning(msg)

    # make sure we weren't given contradictory flags
    if args.split and exons_given:
        msg = "Both --split and --exons were given. Only one of these is allowed."
        raise ValueError(msg)

    if exons_given and (exons_a_given or exons_b_given):
        msg = ("Both --exons and (--exons-a or --exons-b) were given. --exons "
               "should not be given with the --exons-a and --exons-b arguments.")
        raise ValueError(msg)

    exons = None
    exons_a = None
    exons_b = None

    msg = "Reading bed A"
    logger.info(msg)
    bed_a = bed_utils.read_bed(args.bed_a)

    msg = "Reading bed B"
    logger.info(msg)
    bed_b = bed_utils.read_bed(args.bed_b)

    if args.split:
        msg = "Splitting bed A"
        logger.info(msg)
        exons_a = parallel.apply_parallel_split(bed_a, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_a = pd.concat(exons_a)

        msg = "Splitting bed B"
        logger.info(msg)
        exons_b = parallel.apply_parallel_split(bed_b, args.num_cpus,
            split_all_blocks, progress_bar=True, num_groups=args.num_groups)
        exons_b = pd.concat(exons_b)

    if exons_given:
        msg = "Reading exons"
        logger.info(msg)
        exons = bed_utils.read_bed(args.exons)

    if exons_a_given:
        msg = "Reading A exons"
        logger.info(msg)
        exons_a = bed_utils.read_bed(args.exons_a)

    if exons_b_given:
        msg = "Reading B exons"
        logger.info(msg)
        exons_b = bed_utils.read_bed(args.exons_b)

    msg = "Finding all A entries which overlap B entries"
    logger.info(msg)
    remaining_a_ids = bed_utils.subtract_bed(bed_a, bed_b,
        min_a_overlap=args.min_a_overlap, min_b_overlap=args.min_b_overlap,
        exons=exons, exons_a=exons_a, exons_b=exons_b)

    msg = "Filtering the A entries which had overlaps"
    logger.info(msg)
    m_remaining = bed_a['id'].isin(remaining_a_ids)
    bed_a_remaining = bed_a[m_remaining]

    msg = "Writing remaining A entries to disk"
    logger.info(msg)
    bed_utils.write_bed(bed_a_remaining, args.out)
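# A small illustration of the overlap-fraction idea behind --min-a-overlap and
# --min-b-overlap: the overlap of two half-open intervals, expressed as a fraction
# of each interval's length. The thresholds in the script are applied per feature
# (optionally after splitting into exons), which this sketch ignores.
def overlap_fractions(a_start, a_end, b_start, b_end):
    overlap = max(0, min(a_end, b_end) - max(a_start, b_start))
    frac_a = overlap / (a_end - a_start)
    frac_b = overlap / (b_end - b_start)
    return frac_a, frac_b

# A: [100, 200), B: [150, 400): 50 bp overlap
assert overlap_fractions(100, 200, 150, 400) == (0.5, 0.2)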
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script merges either the exons or CDS regions of all "
        "transcript isoforms into a single \"super gene isoform\". It does this "
        "based on the given GTF feature type and attribute (with defaults \"CDS\" "
        "and \"gene_id\", respectively).")

    parser.add_argument('gtf', help="The GTF file")
    parser.add_argument('out', help="The output (merged) GTF file")

    parser.add_argument('--feature-type', help="The type of features to merge",
        default=default_feature_type)
    parser.add_argument('--group-attribute', help="The attribute by which the "
        "features will be merged", default=default_group_attribute)
    parser.add_argument('--id-format-str', help="The python format string to "
        "use for creating the \"transcript\" identifiers",
        default=default_id_format_str)
    parser.add_argument('--chr-name-file', help="If this file is specified, it "
        "will be used to determine the seqname sort order. This should be the "
        "\"chrName.txt\" file created by STAR. If not present, the transcripts "
        "will be sorted alphabetically (1, 10, 11, 2, ..., KL568162.1, MT, X, Y).",
        default=default_chr_name_file)
    parser.add_argument('--add-exons', help="If this flag is given, then all "
        "features will be duplicated, but with the feature type \"exon\". "
        "Presumably, this should be given when \"CDS\" features are merged, and "
        "the resulting GTF file will be used by STAR (or anything else expecting "
        "\"exon\"s).", action='store_true')
    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the features. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Reading GTF file"
    logger.info(msg)
    gtf_df = gtf_utils.read_gtf(args.gtf)

    msg = "Extracting desired features"
    logger.info(msg)
    m_feature_type = gtf_df['feature'] == args.feature_type
    gtf_feature_df = gtf_df[m_feature_type]

    msg = "Parsing GTF attributes"
    logger.info(msg)
    attributes = parallel.apply_parallel_split(
        gtf_feature_df,
        args.num_cpus,
        parse_attributes_group,
        progress_bar=True,
        num_groups=args.num_groups)
    attributes_df = pd.concat(attributes)

    attributes_df['end'] = attributes_df['end'].astype(int)
    attributes_df['start'] = attributes_df['start'].astype(int)

    msg = "Merging isoforms"
    logger.info(msg)
    gene_features = attributes_df.groupby(args.group_attribute)
    merged_genes = parallel.apply_parallel_groups(
        gene_features,
        args.num_cpus,
        merge_gene_group,
        args.group_attribute,
        args.id_format_str,
        progress_bar=True)
    merged_genes_df = pd.concat(merged_genes)

    if args.add_exons:
        merged_exons = merged_genes_df.copy()
        merged_exons['feature'] = 'exon'
        merged_genes_df = pd.concat([merged_exons, merged_genes_df])

    merged_genes_df['start'] = merged_genes_df['start'].astype(int)

    # now, sort the merged isoforms
    # this is a bit of a hack, because it is actually using the sorting routine
    # for bed data frames
    # we need a dummy 'id' column for sorting, so just use the attributes
    merged_genes_df['id'] = merged_genes_df['attributes']
    merged_genes_df = bed_utils.sort(merged_genes_df,
                                     seqname_order=args.chr_name_file)

    # last, drop duplicate rows
    fields = ['seqname', 'source', 'feature', 'start', 'end', 'strand']
    merged_genes_df = merged_genes_df.drop_duplicates(subset=fields)

    gtf_utils.write_gtf(merged_genes_df, args.out, compress=False)
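# A hedged sketch of the kind of merge that produces a "super gene isoform":
# collapse all (possibly overlapping) feature intervals of a gene into a minimal
# set of non-overlapping intervals. merge_gene_group in the script may differ in
# details (it also rebuilds the GTF attributes), but this is the core interval logic.
def merge_intervals(intervals):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [tuple(i) for i in merged]

# three isoform CDS chunks collapse to two merged blocks
assert merge_intervals([(100, 200), (150, 250), (400, 500)]) == [(100, 250), (400, 500)]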
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script splits large chromosomes into several smaller "
        "ones. This is required to use BedTools for chromosomes with size larger "
        "than about 500M. In particular, this script splits chromosomal sequences "
        "into smaller chunks and updates GTF annotations to use the smaller "
        "chromosomes.\n\nFor more information, see "
        "https://groups.google.com/forum/#!topic/bedtools-discuss/t-nQSCxaFGE")

    parser.add_argument('fasta', help="The chromosome sequence file")
    parser.add_argument('gtf', help="The annotation file")
    parser.add_argument('out', help="The base output files. The script will create "
        "the files <out>.fa and <out>.gtf.")
    parser.add_argument('--max-size', help="The largest allowed size (in bp) for a "
        "chromosome", type=int, default=default_max_size)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Splitting fasta sequences"
    logging.info(msg)

    fasta = fastx_utils.get_read_iterator(args.fasta, is_fasta=True)
    split_fasta = {}

    for name, seq in fasta:
        name_split = name.split(" ")
        split_seqs = [
            l for l in iter(partial(StringIO(seq).read, int(args.max_size)), '')
        ]

        for i, split_seq in enumerate(split_seqs):
            # a bit of a hack to save the base sequence name
            n = name_split[0]
            name_split[0] = "{}_{}".format(name_split[0], i)
            split_name = ' '.join(name_split)
            split_fasta[split_name] = split_seq
            name_split[0] = n

    msg = "Writing fasta output file"
    logging.info(msg)
    fasta_out = "{}.fa".format(args.out)
    fastx_utils.write_fasta(split_fasta, fasta_out, compress=False,
                            progress_bar=True)

    msg = "Reading GTF"
    logging.info(msg)
    gtf = gtf_utils.read_gtf(args.gtf)

    msg = "Updating GTF coordinates"
    logging.info(msg)

    # get the split for each feature
    start_split_num = gtf['start'] // args.max_size
    end_split_num = gtf['end'] // args.max_size

    # wrap the coordinates based on the max size
    gtf['start'] = np.mod(gtf['start'], args.max_size)
    gtf['end'] = np.mod(gtf['end'], args.max_size)

    # for the names, we need them as strings without ".0" at the end
    start_split_num = start_split_num.astype(int)
    start_split_num = start_split_num.astype(str)

    end_split_num = end_split_num.astype(int)
    end_split_num = end_split_num.astype(str)

    # create the start and end names
    split_start_seqname = gtf['seqname'] + "_" + start_split_num
    split_end_seqname = gtf['seqname'] + "_" + end_split_num

    gtf['start_seqname'] = split_start_seqname
    gtf['end_seqname'] = split_end_seqname

    # remove features which span the gaps
    m_span = gtf['start_seqname'] != gtf['end_seqname']
    gtf = gtf[~m_span]

    num_spanning = sum(m_span)
    msg = ("Number of features spanning length boundaries: {}.\n\nThese features "
           "will be discarded.".format(num_spanning))
    logging.warning(msg)

    # and update the seqnames
    gtf['seqname'] = gtf['start_seqname']

    msg = "Writing GTF output file"
    logging.info(msg)
    gtf_out = "{}.gtf".format(args.out)
    gtf_utils.write_gtf(gtf, gtf_out, compress=False)
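# A worked example of the coordinate rewriting above, with a hypothetical maximum
# chunk size of 500,000,000 bp: a feature at position 1,234,567,890 on "chr1"
# lands in chunk 2 and is renamed and wrapped accordingly. Features whose start
# and end fall in different chunks are discarded by the script.
max_size = 500000000
position = 1234567890

chunk = position // max_size                  # 2
new_position = position % max_size            # 234567890
new_seqname = "{}_{}".format("chr1", chunk)   # "chr1_2"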