def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script counts the number of reads in the given fastq " "(possibly gzipped) files.") parser.add_argument('files', help="A glob-style re giving the filenames", nargs='+') parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) datasets = [] read_counts = [] for f in args.files: msg = "Processing file: {}".format(f) logger.info(msg) read_count = fastx_utils.get_read_count(f, is_fasta=False) datasets.append(pyllars.utils.get_basename(f)) read_counts.append(read_count) df = pd.DataFrame() df['dataset'] = datasets df['reads'] = read_counts msg = "Writing data frame to disk" logger.info(msg) pd_utils.write_df(df, args.out, index=False)
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script counts the number of uniquely- and multi-mapping " "reads in a list of bam files.") parser.add_argument('files', help="A glob-style re giving the filenames", nargs='+') parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) datasets= [] aligned_reads = [] uniquely_aligned_reads = [] for f in args.files: msg = "Processing file: {}".format(f) logger.info(msg) num_aligned_reads = bam_utils.count_aligned_reads(f) num_uniquely_aligned_reads = bam_utils.count_uniquely_mapping_reads(f) datasets.append(pyllars.utils.get_basename(f)) aligned_reads.append(num_aligned_reads) uniquely_aligned_reads.append(num_uniquely_aligned_reads) msg = "Constructing data frame" logger.info(msg) df = pd.DataFrame() df['dataset'] = datasets df['aligned_reads'] = aligned_reads df['uniquely_aligned_reads'] = uniquely_aligned_reads df['multimapping_reads'] = df['aligned_reads'] - df['uniquely_aligned_reads'] msg = "Writing data frame to disk" logger.info(msg) pd_utils.write_df(df, args.out, index=False)
def write_gtf(data_frame, filename, compress=True, use_default_fields=True,
        **kwargs):
    """ This function formats a data frame such that it will behave as a GTF
        file. In particular, it writes a tab-delimited file and prepends the
        hash mark (#) to the field names in the header so they are treated as
        comments by typical GTF parsers. The "start" and "end" features (4th
        and 5th columns) will be cast as integers.

        Args:
            data_frame (pandas.DataFrame): a data frame representing GTF entries

            filename (string): the name of the output file

            compress (bool): whether to gzip the output

            use_default_fields (bool): whether to use all of the fields in the
                data frame or only those in gtf_field_names

            kwargs: these are passed through to the write_df function

        Returns:
            None
    """
    do_not_compress = not compress

    if use_default_fields:
        data_frame = data_frame[gtf_field_names]

    # work on a copy so the caller's data frame is not modified and pandas
    # does not raise a SettingWithCopyWarning on the slice above
    data_frame = data_frame.copy()

    start_field = data_frame.columns[3]
    end_field = data_frame.columns[4]

    data_frame[start_field] = data_frame[start_field].astype(int)
    data_frame[end_field] = data_frame[end_field].astype(int)

    header = ['#{}'.format(c) for c in data_frame.columns]

    pd_utils.write_df(data_frame, filename, index=False, sep='\t',
        header=header, do_not_compress=do_not_compress,
        quoting=csv.QUOTE_NONE, **kwargs)
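# A small usage sketch for write_gtf. The column names below follow the
# standard nine GTF fields; the exact names in gtf_field_names may differ, so
# this frame is an assumption about the expected layout, not the module's
# definition.
import pandas as pd

example = pd.DataFrame([
    {'seqname': 'chr1', 'source': 'example', 'feature': 'exon',
     'start': 100, 'end': 200, 'score': '.', 'strand': '+',
     'frame': '.', 'attributes': 'gene_id "g1"; transcript_id "t1";'}
])

# use_default_fields=False keeps the columns exactly as given above
write_gtf(example, 'example.gtf', compress=False, use_default_fields=False)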
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script creates a report for a list of Entrez or Entrez " "gene identifiers. In particular, it extracts information from Swiss-Prot, " "TrEMBL, Interpro, PDB, Pfam, PROSITE, the Gene Ontology, and KEGG. It uses " "the mygene.info service to collect the annotations.") parser.add_argument('filename', help="The name of the file") parser.add_argument( 'out', help="The output file. It will contain the same information " "as the input file with the gene annotations appended as new columns. The format " "is the same as the input format.") parser.add_argument( '-f', '--filetype', help="The format of the input file. By default, " "the format will be guessed based on the file extension.", choices=filetype_choices, default=default_filetype) parser.add_argument('--sep', help="The spearator in the file (if csv)", default=default_sep) parser.add_argument( '-s', '--sheet', help="The name of the sheet (for excel files) " "or key (for hdf5 files) from which the gene list is extracted. By default, the " "first sheet in an excel file is used. This argument is not used for csv files.", default=default_sheet) parser.add_argument( '-c', '--column', help="The name of the column (or key in hdf5) " "from which the gene list is extracted. By default, the first column in the " "extracted data frame is used.", default=default_column) parser.add_argument( '--do-not-compress', help="If this flag is present and the file " "is in csv format, then the output will not be compressed. By default, the output " "is compressed.", action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Reading the file" logger.info(msg) df = pd_utils.read_df(args.filename, filetype=args.filetype, sheet=args.sheet, sep=args.sep) msg = "Extracting gene identifiers" logger.info(msg) if args.column is None: args.column = df.columns[0] gene_ids = df[args.column] msg = "Pulling information from mygene.info" logger.info(msg) res_df = mygene_utils.query_mygene(gene_ids) msg = "Joining results to original input" logger.info(msg) res_df = df.merge(res_df, left_on=args.column, right_on='gene_id', how='inner') msg = "Writing output" logger.info(msg) pd_utils.write_df(res_df, args.out, filetype=args.filetype, sheet=args.sheet, do_not_compress=args.do_not_compress, index=False) msg = "Finished" logger.info(msg)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script counts the number of unique reads in the " "given files, all of which must be the same type. In the case of bam " "files, it only counts primary alignments (so it does not " "double-count multimappers, and it does not include unmapped reads " "present in the file.") parser.add_argument('files', help="The fasta, fastq or bam files", nargs='+') parser.add_argument('-o', '--out', help="The (csv.gz) output file " "containing the lengths and counts", required=True) parser.add_argument( '-f', '--file-type', help="The type of the files. All " "files must be of the same type. If the \"AUTO\" file type is given, " "then the type will be guessed on the extension of the first file " "using the following heuristic: \"bam\" if the extension is\".bam\" " "or \".sam\"; " "\"fastq\" if the extension is \"fastq\", \"fastq.gz\", \"fq\", or " "\"fq.gz\"; \"fasta\" otherwise.", choices=file_type_choices, default=default_file_type) parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use", type=int, default=default_num_cpus) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) if args.file_type == "AUTO": args.file_type = guess_file_type(args.files[0]) msg = "The guessed file type is: {}".format(args.file_type) logger.info(msg) # grab the correct function pointer get_length_distribution = file_type_get_length_distribution[args.file_type] msg = "Collecting all read length distributions" logger.info(msg) all_length_distribution_dfs = parallel.apply_parallel_iter( args.files, args.num_cpus, get_length_distribution, progress_bar=True) msg = "Combining data frames into one large df" logger.info(msg) length_distribution_df = pd.concat(all_length_distribution_dfs) msg = "Writing counts to disk" logger.info(msg) pd_utils.write_df(length_distribution_df, args.out, index=False)