Example #1
def check_vcf_gq_field(
    vcf_path: Union[Path, str],
) -> None:
    """Check vcf file for GQ field"""
    vcf_file = VCF(vcf_path, threads=settings.cyvcf_threads)
    if not vcf_file.contains("GQ"):
        raise VCFParserError(f"GQ not found in {vcf_path}")
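A minimal usage sketch, assuming the function above is importable together with the project's settings module and its VCFParserError exception; the input path is a placeholder.

try:
    check_vcf_gq_field("sample.vcf.gz")  # hypothetical input file
except VCFParserError as err:
    print(f"GQ check failed: {err}")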
Example #2
def process_vep_scores(vcf_file, is_clinvar=False):
    indexes, scores, absent = OrderedDict(), defaultdict(list), []
    vcf_data = VCF(vcf_file)
    if vcf_data.contains("ANN") or vcf_data.contains("CSQ"):
        spliceai_scores_fromtool = False
        for field in vcf_data.header_iter():
            if field["HeaderType"] == "INFO" and field["ID"] == "ANN":
                tools = field["Description"].split("Format:")[1][:-1].strip().split("|")
                for plugin in vep_plugins:
                    if plugin in tools:
                        indexes[plugin] = tools.index(plugin)
                    else:
                        absent.append(plugin)
            elif field["HeaderType"] == "INFO" and field["ID"] == "SpliceAI":
                spliceai_scores_fromtool = True

        for record in vcf_data:
            key = record.ID + "_" + str(record.POS)
            scores[key].extend([record.CHROM, record.POS, record.REF, record.ALT, record.ID, record.var_type, record.var_subtype])
            annotation = record.INFO.get("ANN")
            if annotation:
                info = annotation.split(",")[0].split("|")

                try:
                    scores[key].append(info[tools.index('Existing_variation')])
                    scores[key].append(info[tools.index('HGVSc')])
                    scores[key].append(info[tools.index('Consequence')])
                except ValueError:
                    pass
                if len(absent) > 0:
                    scores[key].append(("GERP", record.INFO.get("GERP")))
                    scores[key].append(("phyloP", record.INFO.get("phyloP")))
                    scores[key].append(("phastCons", record.INFO.get("phastCons")))
                    scores[key].append(("SiPhy", record.INFO.get("SiPhy")))
                    scores[key].append(("fitcons", record.INFO.get("fitcons")))
                    scores[key].append(("LINSIGHT", record.INFO.get("linsight_g")))
                    scores[key].append(("ReMM", record.INFO.get("ReMM")))
                    scores[key].append(("DANN", record.INFO.get("DANN")))
                    scores[key].append(("GWAVA", record.INFO.get("GWAVA")))
                    scores[key].append(("FATHMM-MKL", record.INFO.get("fathmmMKL")))
                    scores[key].append(("Eigen", record.INFO.get("Eigen")))
                    scores[key].append(("Eigen-PC", record.INFO.get("Eigen-PC")))
                    scores[key].append(("funseq2", record.INFO.get("funseq2")))
                    scores[key].append(("dpsi_zscore", record.INFO.get("dpsi_zscore")))
                    scores[key].append(("traP", record.INFO.get("traP")))
                    if spliceai_scores_fromtool:
                        scores[key].append(("SpliceAI", record.INFO.get("SpliceAI")))
                    else:
                        scores[key].append(("SpliceAI", format_spliceai_fields(record)))

                for pl, i in indexes.items():
                    scores[key].append((pl, info[i]))

            if is_clinvar:
                scores[key].append(('CLNREVSTAT', record.INFO.get("CLNREVSTAT")))
                scores[key].append(('CLNSIG', record.INFO.get("CLNSIG")))


    else:
        print("Program requires ANN field to be present in the INFO field of the input VCF file.\n")
        exit(1)

    # NOTE: this loop collects per-variant SpliceAI values but discards the
    # result, so it has no effect on the returned scores.
    for k, v in scores.items():
        [val[1] for val in v if isinstance(val, tuple) and val[0] == "SpliceAI"]

    vcf_data.close()
    return scores
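A hedged calling sketch for the function above, assuming its module also provides vep_plugins and format_spliceai_fields; the file name is a placeholder.

scores = process_vep_scores("vep_annotated.vcf.gz", is_clinvar=False)
for variant_key, values in scores.items():
    # The first seven entries per variant are CHROM, POS, REF, ALT, ID,
    # var_type and var_subtype; tool annotations follow either as raw VEP
    # sub-fields or as (name, value) tuples.
    print(variant_key, values[:7])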
Example #3
def extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose):
    """
   Function that reads VEP/vcfanno-annotated VCF and extends the VCF INFO column with tags from
   1. CSQ elements within the primary transcript consequence picked by VEP, e.g. SYMBOL, Feature, Gene, Consequence etc.
   2. Cancer-relevant gene annotations, e.g. known oncogenes/tumor suppressors, known antineoplastic drugs interacting with a given protein etc.
   3. Protein-relevant annotations, e.g. cancer hotspot mutations, functional protein features etc.
   4. Variant effect predictions
   """

    ## read VEP and PCGR tags to be appended to VCF file
    pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
        os.path.join(pcgr_db_directory, 'pcgr_infotags.tsv'))
    if pcgr_predispose is True:
        pcgr_vcf_infotags_meta = pcgrutils.read_infotag_file(
            os.path.join(pcgr_db_directory,
                         'pcgr_infotags_predisposition.tsv'))

    out_vcf = re.sub(r'\.vcf(\.gz){0,}$', '.annotated.vcf', query_vcf)

    vep_to_pcgr_af = {
        'gnomAD_AMR_AF': 'AMR_AF_GNOMAD',
        'gnomAD_AFR_AF': 'AFR_AF_GNOMAD',
        'gnomAD_EAS_AF': 'EAS_AF_GNOMAD',
        'gnomAD_NFE_AF': 'NFE_AF_GNOMAD',
        'gnomAD_AF': 'GLOBAL_AF_GNOMAD',
        'gnomAD_SAS_AF': 'SAS_AF_GNOMAD',
        'gnomAD_OTH_AF': 'OTH_AF_GNOMAD',
        'gnomAD_ASJ_AF': 'ASJ_AF_GNOMAD',
        'gnomAD_FIN_AF': 'FIN_AF_GNOMAD',
        'AFR_AF': 'AFR_AF_1KG',
        'AMR_AF': 'AMR_AF_1KG',
        'SAS_AF': 'SAS_AF_1KG',
        'EUR_AF': 'EUR_AF_1KG',
        'EAS_AF': 'EAS_AF_1KG',
        'AF': 'GLOBAL_AF_1KG'
    }

    vcf = VCF(query_vcf)
    vep_csq_index2fields = {}
    vep_csq_fields2index = {}
    dbnsfp_prediction_algorithms = []
    effect_predictions_description = ""
    for e in vcf.header_iter():
        header_element = e.info()
        if 'ID' in header_element.keys():
            identifier = str(header_element['ID'])
            if identifier == 'CSQ' or identifier == 'DBNSFP':
                description = str(header_element['Description'])
                if 'Format: ' in description:
                    subtags = description.split('Format: ')[1].split('|')
                    if identifier == 'CSQ':
                        i = 0
                        for t in subtags:
                            v = t
                            if t in vep_to_pcgr_af:
                                v = str(vep_to_pcgr_af[t])
                            if v in pcgr_vcf_infotags_meta:
                                vep_csq_index2fields[i] = v
                                vep_csq_fields2index[v] = i
                            i = i + 1
                    if identifier == 'DBNSFP':
                        if len(subtags) > 7:
                            effect_predictions_description = "Format: " + '|'.join(
                                subtags[7:])
                        i = 7
                        while (i < len(subtags)):
                            dbnsfp_prediction_algorithms.append(
                                str(
                                    re.sub(r'((_score)|(_pred))"*$', '',
                                           subtags[i])))
                            i = i + 1

    for tag in pcgr_vcf_infotags_meta:
        if not vcf.contains(tag):
            vcf.add_info_to_header({
                'ID': tag,
                'Description': str(pcgr_vcf_infotags_meta[tag]['description']),
                'Type': str(pcgr_vcf_infotags_meta[tag]['type']),
                'Number': str(pcgr_vcf_infotags_meta[tag]['number'])
            })

    w = Writer(out_vcf, vcf)
    current_chrom = None
    num_chromosome_records_processed = 0
    pcgr_onco_xref_map = {
        'SYMBOL': 1,
        'ENTREZ_ID': 2,
        'UNIPROT_ID': 3,
        'APPRIS': 4,
        'UNIPROT_ACC': 5,
        'CHORUM_ID': 6,
        'TUMOR_SUPPRESSOR': 7,
        'ONCOGENE': 8,
        'NETWORK_CG': 9,
        'DISGENET_CUI': 10,
        'CHEMBL_COMPOUND_ID': 11,
        'INTOGEN_DRIVER': 12,
        'ONCOSCORE': 13,
        'CANCER_PREDISPOSITION': 14
    }
    for rec in vcf:
        all_transcript_consequences = []
        if current_chrom is None:
            current_chrom = str(rec.CHROM)
            num_chromosome_records_processed = 0
        else:
            if str(rec.CHROM) != current_chrom:
                logger.info(
                    'Completed summary of functional annotations for ' +
                    str(num_chromosome_records_processed) +
                    ' variants on chromosome ' + str(current_chrom))
                current_chrom = str(rec.CHROM)
                num_chromosome_records_processed = 0
        if rec.INFO.get('CSQ') is None:
            alt_allele = ','.join(rec.ALT)
            pos = rec.start + 1
            variant_id = 'g.' + str(rec.CHROM) + ':' + str(pos) + str(
                rec.REF) + '>' + alt_allele
            logger.warning(
                'Variant record ' + str(variant_id) +
                ' does not have CSQ tag from Variant Effect Predictor (vep_skip_intergenic in config set to true?)  - variant will be skipped'
            )
            continue
        pcgr_onco_xref = {}
        num_chromosome_records_processed += 1
        if not rec.INFO.get('PCGR_ONCO_XREF') is None:
            for transcript_onco_xref in rec.INFO.get('PCGR_ONCO_XREF').split(
                    ','):
                xrefs = transcript_onco_xref.split('|')
                ensembl_transcript_id = str(xrefs[0])
                pcgr_onco_xref[ensembl_transcript_id] = {}
                for annotation in pcgr_onco_xref_map.keys():
                    annotation_index = pcgr_onco_xref_map[annotation]
                    if annotation_index > (len(xrefs) - 1):
                        continue
                    if xrefs[annotation_index] != '':
                        pcgr_onco_xref[ensembl_transcript_id][
                            annotation] = xrefs[annotation_index]
        for identifier in ['CSQ', 'DBNSFP']:
            if identifier == 'CSQ':
                num_picks = 0
                for csq in rec.INFO.get(identifier).split(','):
                    csq_fields = csq.split('|')
                    if csq_fields[vep_csq_fields2index[
                            'PICK']] == "1":  ## only consider the primary/picked consequence when expanding with annotation tags
                        num_picks += 1
                        j = 0
                        ## loop over all CSQ elements and set them in the vep_info_tags dictionary (for each alt_allele)
                        while (j < len(csq_fields)):
                            if j in vep_csq_index2fields:
                                if csq_fields[j] != '':
                                    rec.INFO[vep_csq_index2fields[j]] = str(
                                        csq_fields[j])
                                    if vep_csq_index2fields[j] == 'Feature':
                                        ensembl_transcript_id = str(
                                            csq_fields[j])
                                        if ensembl_transcript_id in pcgr_onco_xref:
                                            for annotation in pcgr_onco_xref_map.keys(
                                            ):
                                                if annotation == 'CHORUM_ID' or annotation == 'UNIPROT_ACC':
                                                    continue
                                                if annotation in pcgr_onco_xref[
                                                        ensembl_transcript_id]:
                                                    if annotation == 'TUMOR_SUPPRESSOR' or annotation == 'ONCOGENE' or annotation == 'NETWORK_CG' or annotation == 'CANCER_PREDISPOSITION':
                                                        rec.INFO[
                                                            annotation] = True
                                                    else:
                                                        rec.INFO[annotation] = pcgr_onco_xref[
                                                            ensembl_transcript_id][
                                                                annotation]
                                    if vep_csq_index2fields[j] == 'DOMAINS':
                                        domain_identifiers = str(
                                            csq_fields[j]).split('&')
                                        for v in domain_identifiers:
                                            if v.startswith('Pfam_domain'):
                                                rec.INFO['PFAM_DOMAIN'] = str(
                                                    re.sub(
                                                        r'\.[0-9]{1,}$', '',
                                                        re.sub(
                                                            r'Pfam_domain:',
                                                            '', v)))

                                    if vep_csq_index2fields[
                                            j] == 'Existing_variation':
                                        var_identifiers = str(
                                            csq_fields[j]).split('&')
                                        cosmic_identifiers = []
                                        dbsnp_identifiers = []
                                        for v in var_identifiers:
                                            if v.startswith('COSM'):
                                                cosmic_identifiers.append(v)
                                            if v.startswith('rs'):
                                                dbsnp_identifiers.append(
                                                    re.sub('^rs', '', v))
                                        if len(cosmic_identifiers) > 0:
                                            rec.INFO[
                                                'COSMIC_MUTATION_ID'] = '&'.join(
                                                    cosmic_identifiers)
                                        if len(dbsnp_identifiers) > 0:
                                            rec.INFO['DBSNPRSID'] = '&'.join(
                                                dbsnp_identifiers)
                            j = j + 1
                        set_coding_change(rec)
                    symbol = '.'
                    if csq_fields[vep_csq_fields2index['SYMBOL']] != "":
                        symbol = str(
                            csq_fields[vep_csq_fields2index['SYMBOL']])
                    consequence_entry = str(
                        csq_fields[vep_csq_fields2index['Consequence']]
                    ) + ':' + str(symbol) + ':' + str(csq_fields[
                        vep_csq_fields2index['Feature_type']]) + ':' + str(
                            csq_fields[vep_csq_fields2index['Feature']]
                        ) + ':' + str(
                            csq_fields[vep_csq_fields2index['BIOTYPE']])
                    all_transcript_consequences.append(consequence_entry)

            if identifier == 'DBNSFP':
                if not rec.INFO.get('DBNSFP') is None:
                    map_variant_effect_predictors(
                        rec, dbnsfp_prediction_algorithms)
        rec.INFO['VEP_ALL_CONSEQUENCE'] = ','.join(all_transcript_consequences)
        w.write_record(rec)
    w.close()
    logger.info('Completed summary of functional annotations for ' +
                str(num_chromosome_records_processed) +
                ' variants on chromosome ' + str(current_chrom))
    vcf.close()

    if os.path.exists(out_vcf):
        if os.path.getsize(out_vcf) > 0:
            os.system('bgzip -f ' + str(out_vcf))
            os.system('tabix -f -p vcf ' + str(out_vcf) + '.gz')
            annotated_vcf = out_vcf + '.gz'
            write_pass_vcf(annotated_vcf)
        else:
            pcgrutils.pcgr_error_message(
                'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
                logger)
    else:
        pcgrutils.pcgr_error_message(
            'No remaining PASS variants found in query VCF - exiting and skipping STEP 4 (pcgr-writer)',
            logger)
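A minimal invocation sketch, assuming the pcgrutils helpers and logger used above are available, the PCGR data directory contains pcgr_infotags.tsv, and bgzip/tabix are on the PATH; both paths are placeholders.

query_vcf = "tumor_sample.vep.vcfanno.vcf.gz"    # VEP/vcfanno-annotated input (placeholder)
pcgr_db_directory = "/path/to/pcgr/data/grch38"  # placeholder PCGR bundle directory

extend_vcf_annotations(query_vcf, pcgr_db_directory, pcgr_predispose=False)
# On success this writes tumor_sample.vep.vcfanno.annotated.vcf.gz (plus .tbi)
# next to the input, with the extra INFO tags declared in pcgr_infotags.tsv.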
Example #4
def process_vcf(vcf: str,
                tools_config: dict,
                thresholds: list,
                af_col: str,
                is_clinvar: bool = False):
    """
    Scans a VEP-annotated VCF file and extracts all the
    tool scores.

    :param str vcf: VCF file
    :param dict tools_config: Dict with the
        map between available tools and VCF
        annotations (already parsed, only
        existing fields should be passed here).
    :param list thresholds: List of tools
        to process based on the tools config file
        as well as the selected scope for the
        analysis
    :param str af_col: VCF field that measures
        population frequency of variants.
    :param bool is_clinvar: Whether `vcf` is
        from the Clinvar database.

    :return dict: Dict with scores keyed by variant.
    :return list: List with the tools provided in the
        config that were not found in the VCF. (Will be
        used to update the config_tools variable.)

    """
    logging.info("Extracting predictions from VCF.")

    # List of required INFO fields when
    # dataset is from Clinvar
    CLINVAR_FIELDS = ['CLNREVSTAT', 'CLNSIG']
    clinvar_confirmed = False

    # List of tools belonging to the
    # provided scope that will be analysed
    # Requires to be present in the updated config
    # (some tools might be removed from config after processing first file)
    TOOLS_TO_ANALYSE = [
        t[0] for t in thresholds if t[0] in tools_config.keys()
    ]

    # MAP with the list of VCF fields per tool
    TOOLS_CONFIG_MAP = {
        k: v[0]
        for k, v in tools_config.items() if k in TOOLS_TO_ANALYSE
    }

    # List with the VCF fields to check
    # Present fields in the header will be
    # popped out sequentially
    MISSING_VCF_FIELDS_FLAT = [
        i for sublist in [v for _k, v in TOOLS_CONFIG_MAP.items()]
        for i in sublist
    ]

    # Missing tools from VCF to return
    # and exclude from analysis
    MISSING_VCF_TOOLS = []

    # Indexes is an ordered dict with the index(s)
    # where information about a given tool is stored
    # in the VEP annotation field
    vep_indexes = OrderedDict()

    # List of VCF fields (not tool names) that are not
    # found in the VEP annotation field
    absent_from_VEP_annot = []

    # Dict with the list of tools and corresponding
    # VCF fields found in the INFO field as a single
    # annotation
    present_in_INFO = defaultdict(list)

    # List of tool names with all annotation fields
    # found in the VEP annotation field
    all_vep_annotations = []
    present_in_VEP_annot = []

    # Parse header and count number of variants
    vcf_data = VCF(vcf)
    if vcf_data.contains("ANN"):
        VEP_TAG = "ANN"
    elif vcf_data.contains("CSQ"):
        VEP_TAG = "CSQ"
    else:
        raise ValueError(
            "VETA requires VEP annotations. ANN or CSQ field was not found in "
            "the INFO field of the input VCF file. Exiting.\n")

    for field in vcf_data.header_iter():

        # If field is in VEP annotations
        if field["HeaderType"] == "INFO" and field["ID"] == VEP_TAG:
            all_vep_annotations = field["Description"].split(
                "Format:")[1][:-1].strip().split("|")
            if af_col in all_vep_annotations:
                vep_indexes[af_col] = [all_vep_annotations.index(af_col)]

            # Looks only for scores belonging
            # to the specific scope set
            # Also look for the AF column set
            for _tool in TOOLS_TO_ANALYSE:

                _tool_field = TOOLS_CONFIG_MAP[_tool]
                _present, _absent = _check_if_field_exists(
                    _tool_field, all_vep_annotations)

                if _present:
                    _fields_present = [v[0] for v in _present]
                    present_in_VEP_annot.extend(_fields_present)
                    vep_indexes[_tool] = [v[1] for v in _present]
                    for i in _fields_present:
                        MISSING_VCF_FIELDS_FLAT.remove(i)

                if _absent:
                    absent_from_VEP_annot.extend(_absent)

        elif field["HeaderType"] == "INFO":
            if field['ID'] in MISSING_VCF_FIELDS_FLAT:
                _tool_name = [
                    _t for _t, v in TOOLS_CONFIG_MAP.items()
                    if field['ID'] in v
                ][0]
                present_in_INFO[_tool_name].append(field['ID'])
                MISSING_VCF_FIELDS_FLAT.remove(field['ID'])

            if field['ID'] in CLINVAR_FIELDS:
                clinvar_confirmed = True

            if field['ID'] == af_col and af_col not in vep_indexes.keys(
            ) and af_col not in present_in_INFO.keys():
                present_in_INFO[af_col].append(field['ID'])

    if is_clinvar and not clinvar_confirmed:
        raise ValueError(
            'Input VCF does not appear to be from Clinvar: the {} fields were '
            'not found in the VCF INFO annotations. They are '
            'required for clinical significance extraction.\n'
            'Additionally, be aware that when running in '
            'benchmark mode, non-Clinvar files must be passed '
            'via an input directory where target files '
            'referring to each class are located.'.format(CLINVAR_FIELDS))

    # if there are VCF fields from
    # the config absent in VCF header
    # LOG absent fields
    if MISSING_VCF_FIELDS_FLAT:

        for tool, fields in TOOLS_CONFIG_MAP.items():

            for _f in fields:
                if _f in MISSING_VCF_FIELDS_FLAT:
                    logging.info("\'{}\' field does not exist "
                                 "in VCF. Corresponding tool "
                                 "(\'{}\') will be discarded from "
                                 "analysis.".format(_f, tool))

                    MISSING_VCF_TOOLS.append(tool)
                    # Delete tool from analysis
                    # if at least one subfield
                    # is not present
                    try:
                        del vep_indexes[tool]
                    except KeyError:
                        pass

                    try:
                        del present_in_INFO[tool]
                    except KeyError:
                        pass

    n_variants = 0
    for _ in vcf_data:
        n_variants += 1
    vcf_data.close()

    # Pack kwargs
    args = {
        'VEP_TAG': VEP_TAG,
        'all_vep_annotations': all_vep_annotations,
        'present_in_INFO': present_in_INFO,
        'vep_indexes': vep_indexes,
        'is_clinvar': is_clinvar
    }

    # Variant processing
    if n_variants > 5000:
        header = subprocess.Popen(["bcftools", "view", "-h", vcf],
                                  stdout=subprocess.PIPE).stdout.readlines()

        body_variants = tempfile.NamedTemporaryFile()
        subprocess.call(["bcftools", "view", "-H", vcf], stdout=body_variants)

        list_files = osutils.split_file_in_chunks(body_variants.name, header,
                                                  n_variants)
        with multiprocessing.Pool() as p:
            dict_list = list(
                tqdm(p.imap(partial(_iter_variants, **args), list_files)))

        logging.info("Merging data from parallel VCF processing.")
        scores = defaultdict(list)
        for d in dict_list:
            # if bool(d.keys() & scores.keys()):
            #    raise ValueError("Repeated variants in the input file.")
            scores.update(d)
        for f in list_files:
            os.remove(f)
        logging.info("Done.")
        p.close()

    else:
        scores = _iter_variants(vcf, **args)

    return scores, MISSING_VCF_TOOLS
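A hedged usage sketch, assuming the function above is importable along with its helpers (_check_if_field_exists, _iter_variants, osutils) and that bcftools is on the PATH for large files; the tool names, threshold entries and field names below are placeholders.

# Placeholder config: tool name -> list whose first element is the list of VCF fields.
tools_config = {
    "SpliceAI": [["SpliceAI"]],
    "CADD": [["CADD_PHRED"]],
}
# Placeholder thresholds: the first element of each entry is the tool name.
thresholds = [("SpliceAI", 0.2, ">"), ("CADD", 15, ">")]

scores, missing_tools = process_vcf(
    "vep_annotated.vcf.gz",      # hypothetical VEP-annotated VCF
    tools_config=tools_config,
    thresholds=thresholds,
    af_col="gnomADg_AF",         # assumed allele-frequency annotation name
    is_clinvar=False,
)
print(f"{len(scores)} variants scored; tools missing from the VCF: {missing_tools}")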
Example #5
def main(
    in_vcf: str,
    out_vcf: str,
    overwrite: bool,
    verbose: bool,
    min_qual: float,
    min_depth: int,
    min_fed: float,
    max_depth: int,
    min_strand_bias: int,
    min_bqb: float,
    min_mqb: float,
    min_rpb: float,
    min_rpbz: Optional[float],
    max_rpbz: Optional[float],
    max_scbz: Optional[float],
    max_sgb: float,
    min_vdb: float,
    hist: bool,
    min_frs: float,
    min_mq: int,
):
    """Apply the following filters to a VCF:\n
    - Minimum proportion of the expected (median) depth\n
    - Maximum proportion of the expected (median) depth\n
    - Minimum QUAL threshold\n
    - Minimum Strand bias percentage
    """
    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s", level=log_level
    )

    vcf_reader = VCF(in_vcf)
    if not vcf_reader.contains(Tags.Depth.value):
        raise DepthTagError(f"Depth tag {Tags.Depth} not found in header")

    if (not vcf_reader.contains(str(Tags.StrandDepth))) and min_strand_bias:
        logging.warning(
            f"Strand depth tag {Tags.StrandDepth} not found in header. "
            f"Turning off strand bias filter..."
        )
        min_strand_bias = 0

    logging.info("Calculating expected (median) depth...")
    depths = []
    quals = []
    for v in vcf_reader:
        depths.append(get_depth(v))
        quals.append(v.QUAL or 0)

    median_depth = np.median(depths)
    logging.info(f"Expected depth: {median_depth}")

    if hist:
        import histoprint

        tick_format = "% .1f"
        logging.info("Depth histogram:")
        histoprint.print_hist(
            np.histogram(depths, bins=HIST_BINS),
            title="Depth histogram",
            summary=True,
            tick_format=tick_format,
            file=click.get_text_stream("stderr"),
        )

        logging.info("QUAL histogram")
        histoprint.print_hist(
            np.histogram(quals, bins=HIST_BINS),
            title="QUAL histogram",
            summary=True,
            tick_format=tick_format,
            file=click.get_text_stream("stderr"),
        )

    assessor = Filter(
        expected_depth=int(median_depth),
        min_qual=min_qual,
        min_depth=min_depth,
        min_fed=min_fed,
        max_depth=max_depth,
        min_strand_bias=min_strand_bias,
        min_bqb=min_bqb,
        min_mqb=min_mqb,
        min_rpb=min_rpb,
        max_sgb=max_sgb,
        min_vdb=min_vdb,
        min_frs=min_frs,
        min_mq=min_mq,
        min_rpbz=min_rpbz,
        max_rpbz=max_rpbz,
        max_scbz=max_scbz,
    )

    vcf_reader = VCF(in_vcf)
    assessor.add_filters_to_header(vcf_reader)

    if not Path(out_vcf).parent.exists():
        Path(out_vcf).parent.mkdir(exist_ok=True, parents=True)

    vcf_writer = Writer(out_vcf, tmpl=vcf_reader)

    stats = Counter()
    logging.info("Filtering variants...")
    for variant in vcf_reader:
        filter_status = assessor.filter_status(variant)

        if (
            (not overwrite)
            and variant.FILTER is not None
            and filter_status != str(Tags.Pass)
        ):
            current_filter = variant.FILTER.rstrip(";")
            variant.FILTER = f"{current_filter};{filter_status}"
        else:
            variant.FILTER = filter_status

        vcf_writer.write_record(variant)

        stats.update(filter_status.split(";"))

    vcf_reader.close()
    vcf_writer.close()

    logging.info("FILTER STATISTICS")
    logging.info("=================")
    for filt, count in stats.items():
        logging.info(f"Filter: {filt}\tCount: {count}")

    logging.info("Done!")
Example #6
def main(arguments=None):
    args = parse_arguments()
    vcf = VCF(args["vcf_file"], threads=args["threads"])
    vcf.add_info_to_header(UPDATED)
    vcf.add_info_to_header(PANEL_FREQ_DIFF)
    vcf.add_info_to_header(MISSINGNES)
    vcf.add_info_to_header(MAF)
    if not vcf.contains("AF"):
        vcf.add_info_to_header(AF)
    w = Writer(args["output"], vcf)

    panel = generate_panel_data(
        panel_file=args["reference_panel"],
        chr=args["chromosomes"],
        annotation=args["chromosome_annotation"],
        panel_type=args["reference_panel_type"],
        separator=args["separator"],
    )

    vcf_summary = VCFSummary()

    print(
        toml_header % (
            args["outlier_threshold"],
            args["min_ambigious_threshold"],
            args["max_ambigious_threshold"],
            args["reference_panel_type"],
            args["chromosome_annotation"],
            args["ambigious"],
            args["fix_complement_ref_alt"],
        ),
        file=sys.stderr,
    )
    for variant in vcf:
        status = "unchanged"
        reason = "None"
        panel_variant_freq = None
        variant_id_end = str(variant.CHROM) + "_" + str(variant.end)
        if not variant.INFO.get("AF"):
            variant.INFO["AF"] = variant.aaf
        if variant_id_end in panel:
            variant.INFO["UPD"] = 0
            panel_variant = panel[variant_id_end]
            panel_variant_freq = panel_variant["freq"]
            vcf_summary.n_in_panel += 1
            if not variant.ALT:
                print_variant_toml(variant, panel_variant["freq"], "removed",
                                   "unknown_alt/monomorphic")
                vcf_summary.unknown_alt += 1
                continue
            if (args["ambigious"]
                    and variant.aaf > args["min_ambigious_threshold"]
                    and variant.aaf < args["max_ambigious_threshold"]):
                vcf_summary.ambigious += 1
                print_variant_toml(variant, panel_variant["freq"], "removed",
                                   "ambigious_frequency")
                continue
            if should_recode(variant, panel_variant):
                swap_ref_alt(variant)
                variant.INFO["UPD"] = 1
                vcf_summary.updated += 1
                status = "updated"
                reason = "ref_alt_swapped"
            if (should_flipstrand(variant, panel_variant)
                    and args["fix_complement_ref_alt"]):
                flipstrand(variant, panel_variant["freq"])
                variant.INFO["UPD"] = 1
                vcf_summary.flipped += 1
                status = "strand_flipped"
                reason = "ref/alt_not_in_panel_nucleotides"

            vcf_summary.add_variant(variant_id_end)
            v_freq = variant.INFO.get("AF")

            variant.INFO["PFD"] = abs(
                variant.INFO.get("AF") - panel_variant["freq"])
            variant.INFO["MISS"] = np.sum(variant.gt_types == 2) / len(
                variant.gt_types)
            variant.INFO["MAF"] = v_freq if v_freq < 0.5 else 1 - v_freq

            vcf_summary.VARIANTS[variant_id_end].update({
                "freq": variant.aaf,
                "panel_freq": panel_variant["freq"],
                "updated_freq": v_freq,
                "MAF": variant.INFO.get("MAF"),
                "MISS": variant.INFO.get("MISS"),
                "PFD": variant.INFO.get("PFD"),
                "v_id": variant.ID,
                "updated": variant.INFO.get("UPD"),
            })
            print_variant_toml(variant, panel_variant_freq, status, reason)
            vcf_summary.kept += 1
        w.write_record(variant)
    w.close()
    vcf.close()
    plot_file = re.sub(r"(vcf|bcf)(\.gz)*$", "png", args["output"])
    if vcf_summary.n_in_panel != 0:
        create_summary_plot(vcf_summary,
                            outfile=plot_file,
                            threshold=args["outlier_threshold"])
    print(vcf_summary, file=sys.stderr)
    print("n_reference_panel_size=%d" % len(panel.keys()), file=sys.stderr)