Example 1
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of reads in the given fastq "
        "(possibly gzipped) files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames", 
        nargs='+')

    parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    read_counts = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        read_count = fastx_utils.get_read_count(f, is_fasta=False)

        datasets.append(pyllars.utils.get_basename(f))
        read_counts.append(read_count)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['reads'] = read_counts

    msg = "Writing data frame to disk"
    logger.info(msg)

    pd_utils.write_df(df, args.out, index=False)
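
These snippets reference several names that main() does not define: argparse, pd, logger, and the logging_utils, fastx_utils, pyllars.utils and pd_utils helpers. A minimal sketch of the module-level context they appear to assume is shown below; the aliases for the pyllars helper modules are an assumption and may differ in the actual project, and fastx_utils is a project-specific module that is not reconstructed here.

import argparse
import logging

import pandas as pd

import pyllars.utils
import pyllars.logging_utils as logging_utils   # assumed alias
import pyllars.pandas_utils as pd_utils         # assumed alias
# fastx_utils (which provides get_read_count) is project-specific and not shown

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    main()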
Example 2
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of uniquely- and multi-mapping "
        "reads in a list of bam files.")

    parser.add_argument('files', help="A glob-style pattern giving the filenames", 
        nargs='+')

    parser.add_argument('-o', '--out', help="A (csv.gz) output file", required=True)
    
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    datasets = []
    aligned_reads = []
    uniquely_aligned_reads = []

    for f in args.files:
        msg = "Processing file: {}".format(f)
        logger.info(msg)

        num_aligned_reads = bam_utils.count_aligned_reads(f)
        num_uniquely_aligned_reads = bam_utils.count_uniquely_mapping_reads(f)

        datasets.append(pyllars.utils.get_basename(f))
        aligned_reads.append(num_aligned_reads)
        uniquely_aligned_reads.append(num_uniquely_aligned_reads)

    msg = "Constructing data frame"
    logger.info(msg)

    df = pd.DataFrame()
    df['dataset'] = datasets
    df['aligned_reads'] = aligned_reads
    df['uniquely_aligned_reads'] = uniquely_aligned_reads
    df['multimapping_reads'] = df['aligned_reads'] - df['uniquely_aligned_reads']

    msg = "Writing data frame to disk"
    logger.info(msg)

    pd_utils.write_df(df, args.out, index=False)
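
The bam_utils helpers used above are not shown. A rough, hypothetical sketch of what they might do, written directly against pysam, is given below; the real helpers may use a different criterion (for example, mapping quality instead of the NH tag) to decide what counts as a uniquely mapping read.

import pysam

def count_aligned_reads_sketch(bam_path):
    """Hypothetical stand-in for bam_utils.count_aligned_reads: count the
    primary alignments in a bam file."""
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return sum(
            1 for read in bam
            if not read.is_unmapped
            and not read.is_secondary
            and not read.is_supplementary
        )

def count_uniquely_mapping_reads_sketch(bam_path):
    """Hypothetical stand-in for bam_utils.count_uniquely_mapping_reads: count
    primary alignments whose NH tag reports a single hit."""
    with pysam.AlignmentFile(bam_path, "rb") as bam:
        return sum(
            1 for read in bam
            if not read.is_unmapped
            and not read.is_secondary
            and not read.is_supplementary
            and read.has_tag("NH")
            and read.get_tag("NH") == 1
        )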
Example 3
def write_gtf(data_frame, filename, compress=True, use_default_fields=True, **kwargs):
    """ This function formats a data frame such that it will behave as
        a GTF file. In particular, it writes a tab-delimited file and prepends
        the hash mark (#) to the field names in the header so they are treated
        as comments by typical GTF parsers.

        The "start" and "end" features (4th and 5th columns) will be cast
        as integers.
    
        Args:
            data_frame (pandas.DataFrame) : a data frame representing GTF entries

            filename (string) : the name of the output file

            compress (bool) : whether to gzip the output

            use_default_fields (bool) : whether to restrict the output to the
                columns in gtf_field_names (True) or keep all columns of the
                data frame (False)

            kwargs : these are passed through to the write_df function

        Returns:
            None
    """
    do_not_compress = not compress

    if use_default_fields:
        data_frame = data_frame[gtf_field_names]

    # work on a copy so the integer casts below do not modify the caller's
    # data frame (and do not trigger pandas chained-assignment warnings)
    data_frame = data_frame.copy()

    start_field = data_frame.columns[3]
    end_field = data_frame.columns[4]

    data_frame[start_field] = data_frame[start_field].astype(int)
    data_frame[end_field] = data_frame[end_field].astype(int)

    header = ['#{}'.format(c) for c in data_frame.columns]
    pd_utils.write_df(data_frame, filename, index=False, sep='\t', 
        header=header, do_not_compress=do_not_compress, quoting=csv.QUOTE_NONE, **kwargs)
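
gtf_field_names is a module-level constant that is not shown here; presumably it lists the nine standard GTF columns in order. A small usage sketch, assuming that definition and a pandas import, might look like the following (the file name and values are made up for illustration).

# assumed module-level constant: the standard GTF columns, in order
gtf_field_names = [
    'seqname', 'source', 'feature', 'start', 'end',
    'score', 'strand', 'frame', 'attributes'
]

transcripts = pd.DataFrame({
    'seqname': ['chr1', 'chr1'],
    'source': ['example', 'example'],
    'feature': ['exon', 'exon'],
    'start': [100.0, 250.0],   # floats are fine; write_gtf casts start/end to int
    'end': [200.0, 300.0],
    'score': ['.', '.'],
    'strand': ['+', '-'],
    'frame': ['.', '.'],
    'attributes': ['gene_id "g1";', 'gene_id "g1";'],
})

# writes a gzipped, tab-delimited file whose header row is prefixed with '#'
write_gtf(transcripts, 'transcripts.gtf.gz', compress=True)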
Example 4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a report for a list of Entrez or Entrez "
        "gene identifiers. In particular, it extracts information from Swiss-Prot, "
        "TrEMBL, Interpro, PDB, Pfam, PROSITE, the Gene Ontology, and KEGG. It uses "
        "the mygene.info service to collect the annotations.")

    parser.add_argument('filename', help="The input file containing the gene identifiers")
    parser.add_argument(
        'out',
        help="The output file. It will contain the same information "
        "as the input file with the gene annotations appended as new columns. The format "
        "is the same as the input format.")

    parser.add_argument(
        '-f',
        '--filetype',
        help="The format of the input file. By default, "
        "the format will be guessed based on the file extension.",
        choices=filetype_choices,
        default=default_filetype)

    parser.add_argument('--sep',
                        help="The spearator in the file (if csv)",
                        default=default_sep)

    parser.add_argument(
        '-s',
        '--sheet',
        help="The name of the sheet (for excel files) "
        "or key (for hdf5 files) from which the gene list is extracted. By default, the "
        "first sheet in an excel file is used. This argument is not used for csv files.",
        default=default_sheet)

    parser.add_argument(
        '-c',
        '--column',
        help="The name of the column (or key in hdf5) "
        "from which the gene list is extracted. By default, the first column in the "
        "extracted data frame is used.",
        default=default_column)

    parser.add_argument(
        '--do-not-compress',
        help="If this flag is present and the file "
        "is in csv format, then the output will not be compressed. By default, the output "
        "is compressed.",
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading the file"
    logger.info(msg)

    df = pd_utils.read_df(args.filename,
                          filetype=args.filetype,
                          sheet=args.sheet,
                          sep=args.sep)

    msg = "Extracting gene identifiers"
    logger.info(msg)

    if args.column is None:
        args.column = df.columns[0]

    gene_ids = df[args.column]

    msg = "Pulling information from mygene.info"
    logger.info(msg)

    res_df = mygene_utils.query_mygene(gene_ids)

    msg = "Joining results to original input"
    logger.info(msg)

    res_df = df.merge(res_df,
                      left_on=args.column,
                      right_on='gene_id',
                      how='inner')

    msg = "Writing output"
    logger.info(msg)

    pd_utils.write_df(res_df,
                      args.out,
                      filetype=args.filetype,
                      sheet=args.sheet,
                      do_not_compress=args.do_not_compress,
                      index=False)

    msg = "Finished"
    logger.info(msg)
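
mygene_utils.query_mygene is project-specific, but the description says the annotations come from the mygene.info service, whose official Python client is the mygene package. A hypothetical sketch of such a helper, assuming Entrez identifiers and a flat data frame result with a 'gene_id' column (which the merge above relies on), could look like this.

import mygene

def query_mygene_sketch(gene_ids):
    """Hypothetical stand-in for mygene_utils.query_mygene: query mygene.info
    for a list of Entrez gene identifiers and return the annotations as a
    data frame keyed by 'gene_id'."""
    mg = mygene.MyGeneInfo()
    res_df = mg.querymany(
        list(gene_ids),
        scopes='entrezgene',
        fields='symbol,name,go,interpro,pdb,pfam,pathway.kegg',
        as_dataframe=True,
    )
    # querymany indexes the result by the query term; expose it as 'gene_id'
    # so it can be merged back onto the input data frame
    return res_df.reset_index().rename(columns={'query': 'gene_id'})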
Example 5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of unique reads in the "
        "given files, all of which must be the same type. In the case of bam "
        "files, it only counts primary alignments (so it does not "
        "double-count multimappers, and it does not include unmapped reads "
        "present in the file.")

    parser.add_argument('files',
                        help="The fasta, fastq or bam files",
                        nargs='+')
    parser.add_argument('-o',
                        '--out',
                        help="The (csv.gz) output file "
                        "containing the lengths and counts",
                        required=True)

    parser.add_argument(
        '-f',
        '--file-type',
        help="The type of the files. All "
        "files must be of the same type. If the \"AUTO\" file type is given, "
        "then the type will be guessed on the extension of the first file "
        "using the following heuristic: \"bam\" if the extension is\".bam\" "
        "or \".sam\"; "
        "\"fastq\" if the extension is \"fastq\", \"fastq.gz\", \"fq\", or "
        "\"fq.gz\"; \"fasta\" otherwise.",
        choices=file_type_choices,
        default=default_file_type)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use",
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.file_type == "AUTO":
        args.file_type = guess_file_type(args.files[0])
        msg = "The guessed file type is: {}".format(args.file_type)
        logger.info(msg)

    # grab the correct function pointer
    get_length_distribution = file_type_get_length_distribution[args.file_type]

    msg = "Collecting all read length distributions"
    logger.info(msg)

    all_length_distribution_dfs = parallel.apply_parallel_iter(
        args.files, args.num_cpus, get_length_distribution, progress_bar=True)

    msg = "Combining data frames into one large df"
    logger.info(msg)
    length_distribution_df = pd.concat(all_length_distribution_dfs)

    msg = "Writing counts to disk"
    logger.info(msg)

    pd_utils.write_df(length_distribution_df, args.out, index=False)
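
Two names in this script come from the surrounding module: guess_file_type and file_type_get_length_distribution, a dispatch table mapping each file type to a function that returns the per-length read counts for one file. The --file-type help text spells out the extension heuristic, so a sketch of guess_file_type consistent with that description is straightforward (the function name matches the call above; the body is a reconstruction, not the project's actual code).

def guess_file_type(filename):
    """Guess the file type from the extension, following the heuristic in the
    --file-type help text: "bam" for .bam or .sam files, "fastq" for the
    common fastq extensions, and "fasta" otherwise."""
    name = filename.lower()
    if name.endswith(('.bam', '.sam')):
        return 'bam'
    if name.endswith(('.fastq', '.fastq.gz', '.fq', '.fq.gz')):
        return 'fastq'
    return 'fasta'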