Exemple #1
0
def main(args):
    """Run RSEM on an annotation bam and write a gene-count QC json.

    Steps, in order:

    1. Derive ``bam_root`` from the basename of ``args.anno_bam`` with a
       trailing ``.bam`` removed.
    2. Extract the gzipped tar archive ``args.rsem_index`` into the current
       directory, using the members returned by
       ``make_modified_TarInfo(archive, "rsem_index")``.
    3. Format ``RSEM_COMMAND`` with the run parameters and execute it.
    4. Pass ``<bam_root>_rsem.genes.results`` to
       ``calculate_number_of_genes_detected`` (presumably this file is
       produced by the RSEM run -- confirm against ``RSEM_COMMAND``).
    5. Write the result to ``<bam_root>_number_of_genes_detected.json``.

    Args:
        args: Namespace-like object providing ``anno_bam``, ``rsem_index``,
            ``rnd_seed``, ``ncpus``, ``ramGB``, ``read_strand`` and
            ``endedness``.

    Raises:
        subprocess.CalledProcessError: If the RSEM command exits with a
            non-zero status.
    """
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    bam_root = re.sub(r"\.bam$", "", os.path.basename(args.anno_bam))

    with tarfile.open(args.rsem_index, "r:gz") as archive:
        archive.extractall(
            ".", members=make_modified_TarInfo(archive, "rsem_index"))

    rsem_call = shlex.split(
        RSEM_COMMAND.format(
            rnd_seed=args.rnd_seed,
            ncpus=args.ncpus,
            ramGB=args.ramGB,
            fwd_prob=strand_to_fwd_prob(args.read_strand),
            paired_end=format_endedness(args.endedness),
            anno_bam=args.anno_bam,
            bam_root=bam_root,
        ))
    logger.info("Running RSEM command %s", " ".join(rsem_call))
    # Stop here when RSEM fails instead of carrying on with missing or
    # stale result files.
    subprocess.check_call(rsem_call)

    gene_quant_fn = bam_root + "_rsem.genes.results"
    number_of_genes_detected = calculate_number_of_genes_detected(
        gene_quant_fn)

    qc_record = QCMetricRecord()
    qc_record.add(
        QCMetric(
            "number_of_genes_detected",
            {"number_of_genes_detected": number_of_genes_detected},
        ))

    with open(bam_root + "_number_of_genes_detected.json", "w") as f:
        json.dump(qc_record.to_ordered_dict(), f)
def main(args):
    """Write the Spearman correlation of two quantification tables as QC json.

    The two tab-separated files ``args.quants[0]`` and ``args.quants[1]`` are
    read without a header, skipping the first four rows. Column ``1`` of the
    first table is correlated against column ``1`` of the second, and the
    value is stored under the ``spearman_correlation`` metric in
    ``args.output_filename``.

    Args:
        args: Namespace-like object providing ``quants`` (indexable, at least
            two paths) and ``output_filename``.
    """
    read_options = {"sep": "\t", "header": None, "skiprows": 4}
    first_table = pd.read_csv(args.quants[0], **read_options)
    second_table = pd.read_csv(args.quants[1], **read_options)

    rho = first_table[1].corr(second_table[1], method="spearman")

    record = QCMetricRecord()
    record.add(QCMetric("spearman_correlation", {"spearman_correlation": rho}))

    with open(args.output_filename, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Collect miRNA expression and STAR alignment QC metrics into one json.

    Three metrics are added to a single ``QCMetricRecord`` and written to
    ``args.output_filename``:

    * ``expressed_mirnas``: number of rows in ``args.quants`` whose counts
      per million (column ``1`` divided by the column sum / 1e6) are >= 2.
    * ``star_qc_metric``: built from ``args.star_log`` together with
      ``parse_starlog`` (presumably ``QCMetric`` applies the parser to the
      log file -- confirm against the ``QCMetric`` constructor).
    * ``aligned_reads``: sum of the ``"Uniquely mapped reads number"`` and
      ``"Number of reads mapped to multiple loci"`` entries of the STAR
      metric content.

    Args:
        args: Namespace-like object providing ``quants``, ``star_log`` and
            ``output_filename``.
    """
    # Logger arguments are passed lazily, matching the other logging calls
    # in this file.
    logger.info("Reading input tsv: %s", args.quants)
    quants_tsv = pd.read_csv(args.quants, sep="\t", header=None, skiprows=4)
    # calculate number of mirnas expressed at cpm >= 2
    per_million = quants_tsv[1].sum() / 1000000
    quants_tsv["cpm"] = quants_tsv[1] / per_million
    # NOTE(review): the builtin sum() is kept on purpose; Series.sum() would
    # return a numpy integer, which json.dump may reject -- confirm before
    # changing.
    cpm_gte2 = sum(quants_tsv["cpm"] >= 2)
    star_qc_record = QCMetricRecord()
    cpm_metric = QCMetric("expressed_mirnas", {"expressed_mirnas": cpm_gte2})
    # get metrics from star log
    star_qc = QCMetric("star_qc_metric", args.star_log, parse_starlog)
    star_qc_record.add_all([cpm_metric, star_qc])
    # calculate number of reads (unique + multimapping)
    uniquely_mapped = int(star_qc.content["Uniquely mapped reads number"])
    multi_mapped = int(
        star_qc.content["Number of reads mapped to multiple loci"])
    reads_mapped = uniquely_mapped + multi_mapped
    star_qc_record.add(
        QCMetric("aligned_reads", {"aligned_reads": reads_mapped}))
    logger.info("Writing output json %s", args.output_filename)
    with open(args.output_filename, "w") as fp:
        json.dump(star_qc_record.to_ordered_dict(), fp)
Exemple #4
0
def main(args):
    """Count gene types for a bam and write them as a QC json.

    The transcript-id to gene-type mapping is loaded from
    ``args.tr_id_to_gene_type_tsv`` with ``read_dict_from_tsv`` and handed,
    together with ``args.input_bam``, to ``get_gene_type_counts``. The result
    is stored as the ``gene_type_count`` metric in ``args.output_filename``.

    Args:
        args: Namespace-like object providing ``tr_id_to_gene_type_tsv``,
            ``input_bam`` and ``output_filename``.
    """
    record = QCMetricRecord()

    logger.info(
        "Reading transcript id to gene type mapping from %s",
        args.tr_id_to_gene_type_tsv,
    )
    transcript_to_gene_type = read_dict_from_tsv(args.tr_id_to_gene_type_tsv)

    logger.info("Calculating gene type counts for bam %s", args.input_bam)
    counts = get_gene_type_counts(transcript_to_gene_type, args.input_bam)
    record.add(QCMetric("gene_type_count", counts))

    logger.info("Writing QC output into %s", args.output_filename)
    with open(args.output_filename, "wt") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Count genes with an aggregated abundance of at least 1 and write QC json.

    The tab-separated table ``args.abundance`` is passed through
    ``remove_genomic_transcripts`` and ``filter_startswith_prefix`` (with
    ``args.idprefix``), then aggregated per gene on ``args.counts_colname``
    by ``calculate_abundances_aggregated_by_gene``. The number of entries
    that are >= 1 is written to ``args.outfile`` as the
    ``number_of_genes_detected`` metric.

    Args:
        args: Namespace-like object providing ``abundance``, ``idprefix``,
            ``counts_colname`` and ``outfile``.
    """
    table = pd.read_csv(args.abundance, sep="\t")
    without_genomic = remove_genomic_transcripts(table)
    kept = filter_startswith_prefix(without_genomic, args.idprefix)
    per_gene = calculate_abundances_aggregated_by_gene(
        kept, args.counts_colname)

    detected = sum(per_gene >= 1)

    record = QCMetricRecord()
    record.add(
        QCMetric(
            "number_of_genes_detected",
            {"number_of_genes_detected": detected},
        ))

    with open(args.outfile, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
def main(args):
    """Write the Spearman correlation between two replicates as QC json.

    Each replicate's tab-separated abundance table is passed through
    ``remove_genomic_transcripts`` and ``filter_startswith_prefix`` (with the
    replicate's id prefix), then aggregated per gene on the table's last
    column. The two aggregates are outer-aligned with missing entries filled
    with 0 before the correlation is computed. The value is written to
    ``args.outfile`` under the ``replicates_correlation`` metric.

    Args:
        args: Namespace-like object providing ``rep1_abundance``,
            ``rep2_abundance``, ``rep1_idprefix``, ``rep2_idprefix`` and
            ``outfile``.
    """
    first_raw = pd.read_csv(args.rep1_abundance, sep="\t")
    second_raw = pd.read_csv(args.rep2_abundance, sep="\t")

    first_kept = filter_startswith_prefix(
        remove_genomic_transcripts(first_raw), args.rep1_idprefix)
    second_kept = filter_startswith_prefix(
        remove_genomic_transcripts(second_raw), args.rep2_idprefix)
    # Drop references to the raw tables once they are no longer needed.
    del first_raw, second_raw

    first_by_gene = calculate_abundances_aggregated_by_gene(
        first_kept, first_kept.columns[-1])
    second_by_gene = calculate_abundances_aggregated_by_gene(
        second_kept, second_kept.columns[-1])
    del first_kept, second_kept

    left, right = first_by_gene.align(
        second_by_gene, join="outer", fill_value=0)
    rho = left.corr(right, method="spearman")

    metric = QCMetric("replicates_correlation", {"spearman_correlation": rho})
    record = QCMetricRecord([metric])
    with open(args.outfile, "w") as out_handle:
        json.dump(record.to_ordered_dict(), out_handle)
Exemple #7
0
def test_get_content():
    """Content given as {2: "a", 1: "b"} must equal an OrderedDict with keys 1, 2."""
    metric = QCMetric("_", {2: "a", 1: "b"})
    expected = OrderedDict([(1, "b"), (2, "a")])
    assert metric.content == expected
Exemple #8
0
def obj_a1():
    """Build a QCMetric named "a" with content ``{1: 2}``."""
    content = {1: 2}
    return QCMetric("a", content)
Exemple #9
0
def obj_a2():
    """Build a QCMetric named "a" with content ``{2: 3}``."""
    content = {2: 3}
    return QCMetric("a", content)
Exemple #10
0
def main(args):
    """Align fastqs, name the resulting bams and write alignment QC jsons.

    Steps, in order:

    1. Reduce the inputs to one fastq per read end: several files are merged
       with ``concatenate_files``, a single file is used as is. Read 2 is
       only considered when ``args.endedness == "paired"``.
    2. Extract the gzipped tar archive ``args.index`` into the current
       directory, then build the aligner with ``make_aligner`` and run it.
    3. Rename ``Aligned.sortedByCoord.out.bam`` and ``Log.final.out`` in the
       current directory to ``<bamroot>_genome.bam`` and
       ``<bamroot>_Log.final.out`` (presumably both are produced by
       ``aligner.run()`` -- confirm against ``make_aligner``).
    4. Check ``Aligned.toTranscriptome.out.bam`` with ``rsem-sam-validator``;
       rename it to ``<bamroot>_anno.bam`` when valid, otherwise run
       ``convert-sam-for-rsem`` with output name ``<bamroot>_anno``
       (presumably the tool appends ``.bam`` -- confirm).
    5. Produce flagstats for both bams, parse them and the STAR log, and
       write each parsed result next to its source file as ``.json``.

    Args:
        args: Namespace-like object providing ``fastqs_R1``, ``fastqs_R2``,
            ``endedness``, ``index``, ``indexdir``, ``ncpus``, ``ramGB`` and
            ``bamroot``.

    Raises:
        subprocess.CalledProcessError: If ``rsem-sam-validator`` or
            ``convert-sam-for-rsem`` exits with a non-zero status.
    """
    transcriptome_bam = "Aligned.toTranscriptome.out.bam"

    if len(args.fastqs_R1) > 1:
        merged_R1 = concatenate_files(args.fastqs_R1)
    else:
        merged_R1 = args.fastqs_R1[0]

    merged_R2 = None
    if args.endedness == "paired":
        if len(args.fastqs_R2) > 1:
            merged_R2 = concatenate_files(args.fastqs_R2)
        elif len(args.fastqs_R2) == 1:
            merged_R2 = args.fastqs_R2[0]

    fastqs = [merged_R1]
    # merged_R2 can only be set in the paired branch above, so no second
    # endedness check is needed here.
    if merged_R2:
        fastqs.append(merged_R2)

    with tarfile.open(args.index, "r:gz") as archive:
        archive.extractall()
    aligner = make_aligner(args.endedness, fastqs, args.ncpus, args.ramGB,
                           args.indexdir)
    aligner.run()

    cwd = os.getcwd()
    genome_bam_path = os.path.join(cwd, args.bamroot + "_genome.bam")
    anno_bam_path = os.path.join(cwd, args.bamroot + "_anno.bam")
    genome_flagstat_path = os.path.join(cwd,
                                        args.bamroot + "_genome_flagstat.txt")
    anno_flagstat_path = os.path.join(cwd, args.bamroot + "_anno_flagstat.txt")
    star_log_path = os.path.join(cwd, args.bamroot + "_Log.final.out")
    os.rename(os.path.join(cwd, "Aligned.sortedByCoord.out.bam"),
              genome_bam_path)
    os.rename(os.path.join(cwd, "Log.final.out"), star_log_path)

    rsem_check_cmd = "rsem-sam-validator {bam_to_check}".format(
        bam_to_check=transcriptome_bam)
    rsem_output = subprocess.check_output(shlex.split(rsem_check_cmd))
    # rsem validator exits with 0 whether the check passes or not
    # for this reason we check if the output ends in 'is valid!'
    # the other possibility is 'is not valid!'
    last_line = rsem_output.decode().strip().split("\n")[-1]
    rsem_valid = last_line.endswith("is valid!")
    if rsem_valid:
        logger.info("Transcriptome bam is already rsem-sorted.")
        os.rename(os.path.join(cwd, transcriptome_bam), anno_bam_path)
    else:
        logger.info("Transcriptome bam is not rsem-sorted.")
        rsem_sort_cmd = "convert-sam-for-rsem {input} {output}".format(
            input=transcriptome_bam,
            output=args.bamroot + "_anno")
        logger.info("Running %s", rsem_sort_cmd)
        # Stop here when the conversion fails rather than continuing
        # without a usable annotation bam.
        subprocess.check_call(shlex.split(rsem_sort_cmd))

    get_flagstats(genome_bam_path, genome_flagstat_path)
    get_flagstats(anno_bam_path, anno_flagstat_path)
    anno_flagstat_content = parse_flagstats(anno_flagstat_path)
    genome_flagstat_content = parse_flagstats(genome_flagstat_path)
    star_log_content = parse_starlog(star_log_path)
    anno_flagstat_qc = QCMetric("samtools_anno_flagstat",
                                anno_flagstat_content)
    genome_flagstat_qc = QCMetric("samtools_genome_flagstat",
                                  genome_flagstat_content)
    star_log_qc = QCMetric("star_log_qc", star_log_content)
    # Each json is written next to its source file, with the extension
    # swapped.
    write_json(
        anno_flagstat_qc.to_ordered_dict(),
        re.sub(r"\.txt$", ".json", anno_flagstat_path),
    )
    write_json(
        genome_flagstat_qc.to_ordered_dict(),
        re.sub(r"\.txt$", ".json", genome_flagstat_path),
    )
    write_json(star_log_qc.to_ordered_dict(),
               re.sub(r"\.out$", ".json", star_log_path))
Exemple #11
0
def test_equals():
    """Two metrics named "a" must compare equal even though their content differs."""
    empty_metric = QCMetric("a", {})
    filled_metric = QCMetric("a", {"x": "y"})
    assert empty_metric == filled_metric
Exemple #12
0
def test_QCMetric_repr():
    """The repr of a metric must show its name and its content as an OrderedDict."""
    metric = QCMetric("a", {1: "x"})
    expected = "QCMetric('a', OrderedDict([(1, 'x')]))"
    assert repr(metric) == expected
Exemple #13
0
def test_len_0():
    """A metric built from an empty dict must have length 0."""
    empty_metric = QCMetric("a", {})
    assert len(empty_metric) == 0
Exemple #14
0
def test_less_than():
    """A metric named 1 must order before a metric named 2."""
    low = QCMetric(1, {})
    high = QCMetric(2, {})
    assert low < high
Exemple #15
0
def obj_b():
    """Build a QCMetric named "b" with content ``{3: 4}``."""
    content = {3: 4}
    return QCMetric("b", content)
Exemple #16
0
def test_get_name():
    """The ``name`` attribute must return the name given at construction."""
    metric = QCMetric("a", {})
    assert metric.name == "a"
Exemple #17
0
def test_type_check():
    """Constructing a metric with an int as content must raise TypeError."""
    bad_content = 1
    with pytest.raises(TypeError):
        QCMetric("name", bad_content)
Exemple #18
0
def obj_d():
    """Build a QCMetric named "d" with content ``{"a": "b"}``."""
    content = {"a": "b"}
    return QCMetric("d", content)
Exemple #19
0
def obj_c():
    """Build a QCMetric named "c" with content ``{1: 2, 3: 4, 5: 6}``."""
    content = {1: 2, 3: 4, 5: 6}
    return QCMetric("c", content)