Code Example #1
from crimson import picard  # Picard metrics file parser


def process_aln_stats(path):
    # Parse a Picard AlignmentSummaryMetrics file and keep only the PAIR row.
    pd = picard.parse(path)
    raw = next(x for x in pd["metrics"]["contents"] if x["CATEGORY"] == "PAIR")

    # Properly paired reads = reads aligned in pairs minus improper pairs.
    aln_proper_pairs = (raw["READS_ALIGNED_IN_PAIRS"] -
                        raw["PF_READS_IMPROPER_PAIRS"])

    return {
        "num_aligned_bases": raw["PF_ALIGNED_BASES"],
        "num_aligned_reads": raw["PF_READS_ALIGNED"],
        "num_aligned_reads_proper_pairs": aln_proper_pairs,
        "num_total_reads": raw["TOTAL_READS"],
        "pct_adapter": raw["PCT_ADAPTER"],
        "pct_aligned_reads_from_total":
            raw["PF_READS_ALIGNED"] * 100. / raw["TOTAL_READS"],
        "pct_aligned_reads_proper_pairs":
            aln_proper_pairs * 100. / raw["PF_READS_ALIGNED"],
        "pct_chimeras": raw["PCT_CHIMERAS"],
        "rate_indel": raw["PF_INDEL_RATE"],
        "rate_mismatch": raw["PF_MISMATCH_RATE"],
        "strand_balance": raw["STRAND_BALANCE"],
    }
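A minimal usage sketch for the helper above, assuming it is defined or imported in the current module; the metrics file path is hypothetical:

# Hypothetical AlignmentSummaryMetrics file produced by Picard.
stats = process_aln_stats("sample_qc.alignment_summary_metrics.txt")
print(stats["num_aligned_reads"], stats["pct_aligned_reads_from_total"])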
Code Example #2
import os

import pandas as pd
from crimson import picard


def write_aggregated_picard_metrics_by_row(file_names, output_name):
    """Command line entrypoint to parse, aggregate and write Picard row metrics.
    Parameters
    ----------
    args:
        file_names: array of files. The basename of each input should be formatted
        as 'samplename_qc', such as
        "samplename_qc.alignment_summary_metrics.txt" and "samplename_qc.insert_size_metrics.txt"
        output_name: prefix of the output file name, without extension.
    Returns
    ----------
        return: 0
        Returned if the program completes successfully.
    """
    # initial output
    metrics = {}
    d = pd.DataFrame()
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split('_qc')[0]
        metrics[cell_id] = {}
        parsed = picard.parse(file_name)
        class_name = parsed['metrics']['class'].split('.')[2]
        # Alignment metrics return multiple rows (one per read category);
        # keep them all, suffixing each column name with its CATEGORY.
        contents = parsed['metrics']['contents']
        if class_name == "AlignmentSummaryMetrics":
            # parse out PE, R1 and R2. If the reads are unpaired, the contents
            # will be a single dict rather than a list of dicts.
            if isinstance(contents, dict):
                contents = [contents]
            rows = {}
            for m in contents:
                cat = m['CATEGORY']
                rows.update({
                    k + '.' + cat: v
                    for k, v in m.items() if k not in
                    ['SAMPLE', 'LIBRARY', 'READ_GROUP', 'CATEGORY']
                })
        # Sometimes (very rarely) insert size metrics also return multiple rows,
        # e.g. to include TANDEM repeats, but we only output the first row.
        elif class_name == "InsertSizeMetrics":
            # If the element count is less than 21, the insert size metrics
            # returned multiple rows (contents is a list of row dicts).
            if len(contents) < 21:
                rows = contents[0]
            else:
                rows = contents
        else:
            # Other metrics (so far) return single-row results.
            rows = contents
        metrics[cell_id].update({
            k: rows[k]
            for k in rows
            if k not in ['SAMPLE', 'LIBRARY', 'READ_GROUP', 'CATEGORY']
        })
        df = pd.DataFrame.from_dict(metrics, orient='columns')
        df.insert(0, 'Class', class_name)
        d = pd.concat([d, df])  # DataFrame.append is deprecated/removed in newer pandas
    d_T = d.T
    d_T.to_csv(output_name + '.csv')
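A minimal invocation sketch, assuming the function above is importable; the input names are hypothetical but follow the 'samplename_qc' convention from the docstring:

# Hypothetical per-cell Picard outputs; writes sample1_row_metrics.csv.
write_aggregated_picard_metrics_by_row(
    ["sample1_qc.alignment_summary_metrics.txt",
     "sample1_qc.insert_size_metrics.txt"],
    "sample1_row_metrics")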
Code Example #3
from crimson import picard  # Picard metrics file parser


def process_insert_stats(path):
    pd = picard.parse(path)
    raw = pd["metrics"]["contents"]
    # raw can be a list if there is more than one PAIR_ORIENTATION;
    # in that case keep only the FR (forward-reverse) row.
    if isinstance(raw, list):
        raw = next(r for r in raw if r["PAIR_ORIENTATION"] == "FR")
    return {
        "median_absolute_deviation": raw["MEDIAN_ABSOLUTE_DEVIATION"],
        "median_insert_size": raw["MEDIAN_INSERT_SIZE"],
        "min_insert_size": raw["MIN_INSERT_SIZE"],
        "max_insert_size": raw["MAX_INSERT_SIZE"],
    }
Code Example #4
File: hisat2.py  Project: orionzhou/robin
def hisat_check(dirw, ilist, olist, diro1, diro2, paired):
    from crimson import picard
    os.chdir(dirw)
    assert op.isfile(ilist), "%s not exist" % ilist
    ary = np.genfromtxt(ilist, names = True, dtype = object, delimiter = "\t")
    cols = list(ary.dtype.names)
    fho = open(olist, "w")
    if paired:
        fho.write("\t".join(cols + ["BAM",
            "Pair", "Pair_Map", "Pair_Orphan", "Pair_Unmap", \
            "Pair_Map_Hq", "Unpair", "Unpair_Map", "Unpair_Map_Hq"])+"\n")
    else:
        fho.write("\t".join(cols + ["BAM",
            "Total", "Mapped", "Mapped_Hq"]) + "\n")
    for row in ary:
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        bam = "%s/%s.bam" % (diro1, sid)
        assert op.isfile(bam), "%s not exist" % bam
        fs = "%s/%s.sum.txt" % (diro2, sid)
        rs1 = picard.parse(fs)['metrics']['contents']
        if type(rs1) == dict: rs1 = [rs1]
        rs = { rs1[i]['CATEGORY']: rs1[i] for i in list(range(len(rs1))) }
        if paired:
            f1r, f2r, rc, f1p, f1u, f2p, f2u, rrc, rc1, rc2 = row[1:11]
            pair = rs['FIRST_OF_PAIR']['TOTAL_READS']
            pair_map = rs['FIRST_OF_PAIR']['READS_ALIGNED_IN_PAIRS']
            pair_map1 = rs['FIRST_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq1 = rs['FIRST_OF_PAIR']['PF_HQ_ALIGNED_READS']
            pair_map2 = rs['SECOND_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq2 = rs['SECOND_OF_PAIR']['PF_HQ_ALIGNED_READS']
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            pair_orphan = pair_map1 + pair_map2 - pair_map * 2
            pair_unmap = pair - pair_map - pair_orphan
            pair_map_hq = int((pair_map_hq1+pair_map_hq2)/2)
            assert pair == int(rrc), "error 1: %d %s" % (pair, rrc)
            assert int(rc1)+int(rc2) == unpair, "error 2"
            stats = map(str, [pair, pair_map, pair_orphan, pair_unmap, \
                    pair_map_hq, unpair, unpair_map, unpair_map_hq])
            fho.write("\t".join(row + [bam] + list(stats)) + "\n")
        else:
            fr, rc, ft, rrc = row[1:5]
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            stats = map(str, [unpair, unpair_map, unpair_map_hq])
            fho.write("\t".join(row + [bam] + list(stats)) + "\n")
Code Example #5
File: hisat2.py  Project: shanwai1234/maize
def hisat_check(dirw, ilist, olist, diro1, diro2, paired):
    from crimson import picard
    os.chdir(dirw)
    assert op.isfile(ilist), "%s not exist" % ilist
    ary = np.genfromtxt(ilist, names=True, dtype=object, delimiter="\t")
    cols = list(ary.dtype.names)
    fho = open(olist, "w")
    if paired:
        fho.write("\t".join(cols + ["BAM",
            "Pair", "Pair_Map", "Pair_Orphan", "Pair_Unmap", \
            "Pair_Map_Hq", "Unpair", "Unpair_Map", "Unpair_Map_Hq"])+"\n")
    else:
        fho.write("\t".join(cols + ["BAM", "Total", "Mapped", "Mapped_Hq"]) +
                  "\n")
    for row in ary:
        row = [str(x, 'utf-8') for x in list(row)]
        sid = row[0]
        bam = "%s/%s.bam" % (diro1, sid)
        assert op.isfile(bam), "%s not exist" % bam
        fs = "%s/%s.sum.txt" % (diro2, sid)
        rs1 = picard.parse(fs)['metrics']['contents']
        if type(rs1) == dict: rs1 = [rs1]
        rs = {rs1[i]['CATEGORY']: rs1[i] for i in list(range(len(rs1)))}
        if paired:
            f1r, f2r, rc, f1p, f1u, f2p, f2u, rrc, rc1, rc2 = row[1:11]
            pair = rs['FIRST_OF_PAIR']['TOTAL_READS']
            pair_map = rs['FIRST_OF_PAIR']['READS_ALIGNED_IN_PAIRS']
            pair_map1 = rs['FIRST_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq1 = rs['FIRST_OF_PAIR']['PF_HQ_ALIGNED_READS']
            pair_map2 = rs['SECOND_OF_PAIR']['PF_READS_ALIGNED']
            pair_map_hq2 = rs['SECOND_OF_PAIR']['PF_HQ_ALIGNED_READS']
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            pair_orphan = pair_map1 + pair_map2 - pair_map * 2
            pair_unmap = pair - pair_map - pair_orphan
            pair_map_hq = int((pair_map_hq1 + pair_map_hq2) / 2)
            assert pair == int(rrc), "error 1: %d %s" % (pair, rrc)
            assert int(rc1) + int(rc2) == unpair, "error 2"
            stats = map(str, [pair, pair_map, pair_orphan, pair_unmap, \
                    pair_map_hq, unpair, unpair_map, unpair_map_hq])
            fho.write("\t".join(row + [bam] + list(stats)) + "\n")
        else:
            fr, rc, ft, rrc = row[1:5]
            unpair = rs['UNPAIRED']['TOTAL_READS']
            unpair_map = rs['UNPAIRED']['PF_READS_ALIGNED']
            unpair_map_hq = rs['UNPAIRED']['PF_HQ_ALIGNED_READS']
            stats = map(str, [unpair, unpair_map, unpair_map_hq])
            fho.write("\t".join(row + [bam] + list(stats)) + "\n")
Code Example #6
from crimson import picard  # Picard metrics file parser


def process_rna_stats(path):
    # Parse a Picard CollectRnaSeqMetrics output and flatten the fields of interest.
    pd = picard.parse(path)
    raw = pd["metrics"]["contents"]
    cov_sort_key = lambda x: x["normalized_position"]
    return {
        "median_3prime_bias": raw["MEDIAN_3PRIME_BIAS"],
        "median_5prime_bias": raw["MEDIAN_5PRIME_BIAS"],
        "median_5prime_to_3prime_bias": raw["MEDIAN_5PRIME_TO_3PRIME_BIAS"],
        "median_cv_coverage": raw["MEDIAN_CV_COVERAGE"],
        "num_coding_bases": raw["CODING_BASES"],
        "num_intergenic_bases": raw["INTERGENIC_BASES"],
        "num_intronic_bases": raw["INTRONIC_BASES"],
        "num_mrna_bases": raw["CODING_BASES"] + raw["UTR_BASES"],
        "num_ribosomal_bases":
            (raw["RIBOSOMAL_BASES"] if raw["RIBOSOMAL_BASES"] != "" else None),
        "num_total_bases": raw["PF_BASES"],
        "num_utr_bases": raw["UTR_BASES"],
        "pct_coding_bases": raw["PCT_CODING_BASES"],
        "pct_intergenic_bases": raw["PCT_INTERGENIC_BASES"],
        "pct_intronic_bases": raw["PCT_INTRONIC_BASES"],
        "pct_mrna_bases": raw["PCT_MRNA_BASES"],
        "pct_ribosomal_bases":
            (raw["RIBOSOMAL_BASES"] * 100. / raw["PF_ALIGNED_BASES"]
             if raw["RIBOSOMAL_BASES"] != "" else None),
        "pct_utr_bases": raw["PCT_UTR_BASES"],
        # Per-position normalized coverage, sorted by normalized position.
        "normalized_cov": [
            item["All_Reads.normalized_coverage"]
            for item in sorted(pd["histogram"]["contents"], key=cov_sort_key)
        ],
    }
Code Example #7
import json
from os.path import basename

import pandas as pd
from crimson import picard
from google.cloud import storage


def merge_picard_metrics(files, metric_name):
    """
    The pipeline outputs Picard QC metrics at the single cell/sample level.
    This function is called to merge/aggregate QC metrics by metric type and then
    merge multiple QC measurements into a single matrix file, in which columns are
    samples/cells and rows are QC metrics.
    :param files: metric files from pipeline outputs
    :param metric_name: metric name prefixed with the workflow and subworkflow names,
        such as 'run_pipelines.RunStarPipeline.alignment_summary_metrics'
    """
    # set up auth
    client = storage.Client()
    bucket = client.get_bucket('broad-dsde-mint-dev-cromwell-execution')
    # load cromwell credential
    logins = json.load(open('/usr/secrets/broad-dsde-mint-dev-cromwell.json'))
    # initial output
    mets = {}
    for kk in range(0, len(files)):
        fc = files[kk]
        fc = fc.replace('gs://broad-dsde-mint-dev-cromwell-execution/', '')
        blob = bucket.get_blob(fc)
        met_name = basename(fc)
        # sample name is prefix of file name
        sample_name = met_name.split('.')[0]
        with open(met_name, 'wb') as file_obj:
            blob.download_to_file(file_obj)
        # Use the crimson picard parser; a JSON-like dict is returned.
        parsed = picard.parse(met_name)
        class_name = parsed['metrics']['class']
        # Alignment metrics return multiple rows, but only output the PAIR (third) row.
        if class_name == "picard.analysis.AlignmentSummaryMetrics":
            # Only keep the paired-read (PAIR) row.
            met = parsed['metrics']['contents'][2]
        # Sometimes (very rarely) insert size metrics also return multiple rows,
        # e.g. to include TANDEM repeats, but we only output the first row.
        elif class_name == "picard.analysis.InsertSizeMetrics":
            # If the element count is less than 21, the insert size metrics
            # returned multiple rows (contents is a list of row dicts).
            if len(parsed['metrics']['contents']) < 21:
                met = parsed['metrics']['contents'][0]
            else:
                met = parsed['metrics']['contents']
        else:
            # Other metrics (so far) return single-row results.
            met = parsed['metrics']['contents']
        mets[sample_name] = met
    merged = pd.DataFrame.from_dict(mets)
    return merged
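A call sketch with a hypothetical GCS object path under the bucket hard-coded above; GCS credentials and the Cromwell secret file are assumed to be available:

# Hypothetical workflow output path; the metric name follows the docstring's example.
merged = merge_picard_metrics(
    ["gs://broad-dsde-mint-dev-cromwell-execution/run_pipelines/workflow-id/call-qc/sample1.alignment_summary_metrics"],
    "run_pipelines.RunStarPipeline.alignment_summary_metrics")
merged.to_csv("alignment_summary_metrics.csv")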
Code Example #8
    def add(self, sample_name, input_dir, internal_sample_name):
        metric_line = dict()

        self.add_to_metric_line(metric_line, 'SAMPLE_NAME', sample_name)
        self.add_to_metric_line(metric_line, 'INTERNAL_NAME',
                                internal_sample_name)

        flagstat_metrics = flagstat.parse(input_dir.flagstat_file())
        self.add_flagstat_columns(metric_line, flagstat_metrics)

        alignment_metrics = picard.parse(
            input_dir.picard_alignment_metrics_file())
        self.add_picard_alignment_columns(metric_line, alignment_metrics)

        dup_metrics = picard.parse(
            input_dir.picard_mark_duplicates_metrics_file())
        self.add_picard_markdup_columns(metric_line, dup_metrics)

        ins_metrics = picard.parse(input_dir.picard_insert_size_metrics_file())
        self.add_generic_picard_columns(metric_line, ins_metrics, 'INS')

        wgs_metrics = picard.parse(input_dir.picard_wgs_metrics_file())
        self.add_generic_picard_columns(metric_line, wgs_metrics)

        gc_metrics = picard.parse(input_dir.picard_gc_bias_metrics_file())
        self.add_generic_picard_columns(metric_line, gc_metrics)

        all_vc_metrics = picard.parse(input_dir.all_chrom_vc_detail_metrics())
        self.add_generic_picard_columns(metric_line, all_vc_metrics, 'ALL')

        x_vc_metrics = picard.parse(input_dir.X_chrom_vc_detail_metrics())
        self.add_generic_picard_columns(metric_line, x_vc_metrics, 'CHRX')

        verifybamid_metrics = verifybamid.parse(
            input_dir.verifybamid_self_sample_file())
        self.add_freemix_column(metric_line, verifybamid_metrics)

        self.add_to_metric_line(metric_line, 'HAPLOID_COVERAGE',
                                self.haploid_coverage(metric_line))
        self.add_to_metric_line(metric_line, 'interchromosomal_rate',
                                self.interchromosomal_rate(metric_line))
        self.add_to_metric_line(metric_line, 'discordant_rate',
                                self.discordant_rate(metric_line))

        self.lines.append('\t'.join(
            [str(metric_line[key]) for key in self.header_order]))
Code Example #9
import os

import pandas as pd
from crimson import picard


def write_aggregated_picard_metrics_by_table(file_names, output_name):
    """Command line entrypoint to parse and write Picard table metrics.
    Parameters
    ----------
    args:
        file_names: array of files. The basename of each input should be formatted as 'samplename_qc'
        output_name: prefix of the output file name. The basename of each output
        includes the Picard metrics class name.
    Returns
    ----------
        return: 0
        Returned if the program completes successfully.
    """
    for file_name in file_names:
        cell_id = os.path.basename(file_name).split('_qc')[0]
        class_name = os.path.basename(file_name).split('.')[1]
        parsed = picard.parse(file_name)
        dat = pd.DataFrame.from_dict(parsed['metrics']['contents'])
        dat.insert(0, 'Sample', cell_id)
        dat.to_csv(output_name + "_" + class_name + '.csv', index=False)
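As with the row variant, a minimal invocation sketch with a hypothetical input; the class name written into the output file name is taken from the second dot-separated field of the input basename:

# Hypothetical input; writes table_metrics_error_summary_metrics.csv.
write_aggregated_picard_metrics_by_table(
    ["sample1_qc.error_summary_metrics.txt"],
    "table_metrics")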
Code Example #10
from crimson import picard  # Picard metrics file parser


def get_picard_stat(file):
    all_metr = picard.parse(file)['metrics']['contents']
    if isinstance(all_metr, dict):
        # Single-row metrics: return the row as-is.
        return all_metr
    else:
        # Multi-row metrics: keep only the last row.
        return all_metr[-1]
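All of the examples above consume the same parsed layout from crimson's picard.parse: a dict whose 'metrics' entry holds the metrics 'class' and its 'contents' (a single row dict, or a list of row dicts), plus a 'histogram' entry for metric types that carry one. A small inspection sketch with a hypothetical input path:

from crimson import picard

# Hypothetical Picard metrics file.
parsed = picard.parse("sample_qc.alignment_summary_metrics.txt")
print(parsed["metrics"]["class"])      # e.g. picard.analysis.AlignmentSummaryMetrics
rows = parsed["metrics"]["contents"]   # a dict (single row) or a list of dicts (one per row)
if isinstance(rows, dict):
    rows = [rows]
for row in rows:
    print(row.get("CATEGORY"), row.get("TOTAL_READS"))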