def compute_hgvs(x):
    """Convert one row's VCF-style coordinates into a cDNA HGVS object.

    The row ``x`` is indexed with ``CHR_COL``, ``POS_COL``, ``REF_COL`` and
    ``ALT_COL`` to build a ``VCFVariant``, which is turned into a genomic
    HGVS object on the GRCh38 contig map and finally passed to
    ``hgvs_proc.genomic_to_cdna``.

    NOTE(review): ``normalize`` and ``hgvs_proc`` are free names here --
    presumably supplied by an enclosing scope that is not visible in this
    block; confirm against the surrounding code.
    """
    variant = VCFVariant(x[CHR_COL], x[POS_COL], x[REF_COL], x[ALT_COL])
    genomic = variant.to_hgvs_obj(
        hgvs_proc.contig_maps[HgvsWrapper.GRCh38_Assem])

    if normalize:
        normalized = hgvs_proc.normalizing(genomic)
        # Fall back to the un-normalized variant when normalizing yields a
        # falsy result.
        if normalized:
            genomic = normalized

    return hgvs_proc.genomic_to_cdna(genomic)
def convert_to_hg37(vars, brca_resources_dir):
    """Lift variants over from hg38 to hg19 coordinates with CrossMap.py.

    The variants are written to a minimal, header-less VCF file, converted by
    the external ``CrossMap.py vcf`` command and read back in.

    Args:
        vars: iterable of variant objects exposing ``chr``, ``pos``, ``ref``
            and ``alt`` attributes.
        brca_resources_dir: directory containing ``hg38ToHg19.over.chain.gz``
            and ``hg19.fa``.

    Returns:
        A list of ``VCFVariant`` objects built from the records CrossMap.py
        wrote to its output file, in file order.

    Raises:
        IOError / FileNotFoundError: if CrossMap.py did not produce an output
            file (e.g. because it failed).
    """
    import os  # local import: only needed here to build temp-file paths

    def pseudo_vcf_entry(v):
        entries = [v.chr, v.pos, '.', v.ref, v.alt, '', '', '']
        return '\t'.join(str(s) for s in entries)

    lst = [pseudo_vcf_entry(v) for v in vars]

    # A private temporary directory replaces the deprecated, race-prone
    # tempfile.mktemp() and guarantees that the input, the output and any
    # side files written next to the output are removed afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        vcf_tmp = os.path.join(tmp_dir, 'hg38.vcf')
        vcf_tmp_out = os.path.join(tmp_dir, 'hg19.vcf')

        with open(vcf_tmp, 'w') as f:
            f.write('\n'.join(lst))

        args = [
            "CrossMap.py", "vcf",
            brca_resources_dir + "/hg38ToHg19.over.chain.gz",
            vcf_tmp,
            brca_resources_dir + "/hg19.fa",
            vcf_tmp_out,
        ]

        logging.info("Running CrossMap.py to convert to hg19")
        # Pipes are required for communicate() to return anything; without
        # them both streams are always None and nothing is ever logged.
        sp = subprocess.Popen(args,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              universal_newlines=True)
        out, err = sp.communicate()

        if out:
            logging.info("standard output of subprocess: {}".format(out))
        if err:
            logging.info("standard error of subprocess: {}".format(err))
        if sp.returncode != 0:
            logging.error(
                "CrossMap.py exited with return code {}".format(sp.returncode))

        # The output file is only created by CrossMap.py itself, so a failed
        # run still surfaces as an error when opening it.
        with open(vcf_tmp_out, 'r') as f:
            vcf_out_lines = f.readlines()

    records = []
    for line in vcf_out_lines:
        stripped = line.strip()
        # Skip blank lines and VCF header/comment lines, which carry no
        # variant record.
        if not stripped or stripped.startswith('#'):
            continue
        records.append(stripped.split('\t'))

    converted = [VCFVariant(v[0], int(v[1]), v[3], v[4]) for v in records]

    if len(converted) != len(lst):
        # Callers that pair results with inputs by position need to know
        # when the counts differ.
        logging.warning(
            "CrossMap.py returned {} variants for {} input variants".format(
                len(converted), len(lst)))

    return converted
def test_hg19_to_hg38(hgvs_wrapper):
    """Check hg19 -> hg38 conversion against known coordinate pairs.

    Each case pairs an hg19 variant string with the expected hg38 variant
    string. An expected value of ``None`` means the conversion must raise
    ``ValueError`` instead of returning a result.
    """
    cases = [
        ('chr13:g.32890614:AAG>A', 'chr13:g.32316477:AAG>A'),
        ('chr17:g.41231221:A>C', 'chr17:g.43079204:A>C'),
        ('chr17:g.41205772:G>A', 'chr17:g.43053755:G>A'),
        ('chr17:g.41277290:T>C', 'chr17:g.43125273:T>C'),
        ('chr17:g.41279883:GAC>G', None),  # outside transcript boundaries
    ]

    for hg19_str, expected_hg38 in cases:
        hg19_obj = VCFVariant.from_str(hg19_str).to_hgvs_obj(
            hgvs_wrapper.contig_maps[hgvs_utils.HgvsWrapper.GRCh37_Assem])

        if not expected_hg38:
            with pytest.raises(ValueError):
                hgvs_wrapper.hg19_to_hg38(hg19_obj)
            continue

        converted = VCFVariant.from_hgvs_obj(
            hgvs_wrapper.hg19_to_hg38(hg19_obj))
        assert VCFVariant.from_str(expected_hg38) == converted
def from_genomic(parsedLine, fieldIdxDict, hgvs_wrapper, seq_fetcher37,
                 errorsFile):
    """Build a ``VCFVariant`` from the genomic HGVS field of a parsed line.

    Args:
        parsedLine: sequence of field values for one input record.
        fieldIdxDict: mapping from field name to its index in ``parsedLine``;
            the ``'chromosome'`` and ``'gDNA'`` entries are used.
        hgvs_wrapper: object providing ``hgvs_parser`` and ``hgvs_dp``.
        seq_fetcher37: passed through to ``VCFVariant.from_hgvs_obj``.
        errorsFile: writable file object receiving a message on failure.

    Returns:
        The resulting ``VCFVariant``, or ``None`` when an ``HGVSError`` is
        raised while parsing, normalizing or converting.
    """
    chrom = parsedLine[fieldIdxDict['chromosome']].replace('chr', '')
    # The accession is assembled from a fixed 'NC_0000' prefix and a fixed
    # '.10' version, so e.g. 'chr13' becomes 'NC_000013.10'.
    # NOTE(review): this pattern only yields a six-digit accession number for
    # two-digit chromosome names -- confirm inputs are limited accordingly.
    accession = 'NC_0000' + str(chrom) + '.10'
    var_str = accession + ":" + parsedLine[fieldIdxDict['gDNA']]

    try:
        parsed = hgvs_wrapper.hgvs_parser.parse(var_str)
        normalizer = hgvs.normalizer.Normalizer(hgvs_wrapper.hgvs_dp,
                                                shuffle_direction=5)
        normalized = normalizer.normalize(parsed)
        return VCFVariant.from_hgvs_obj(normalized, seq_fetcher37)
    except HGVSError as e:
        message = ('Could not parse genomic field ' + str(var_str) +
                   '. Error was ' + str(e))
        print(message, file=errorsFile)

    return None
def hgvs_variant(hgvs_wrapper):
    """Return the HGVS object for a fixed chr13 deletion on GRCh38 contigs.

    The variant string ``'chr13:g.32316477:AAG>A'`` is parsed into a
    ``VCFVariant`` and converted using the wrapper's GRCh38 contig map.
    """
    vcf_variant = VCFVariant.from_str('chr13:g.32316477:AAG>A')
    grch38_contigs = hgvs_wrapper.contig_maps[
        hgvs_utils.HgvsWrapper.GRCh38_Assem]
    return vcf_variant.to_hgvs_obj(grch38_contigs)
def main(input, output, pkl, log_path, config_file, resources):
    """Annotate a tab-separated variant table with HGVS-derived columns.

    Steps, in order: configure logging to ``log_path``; load the gene
    configuration from ``config_file``; read the table from ``input``; add
    cDNA, genomic (hg38 and hg37), protein and synonym columns; drop the
    temporary helper columns; write the table to ``output`` as TSV.

    Args:
        input: path (or buffer) of the tab-separated input table.
        output: destination of the tab-separated result.
        pkl: passed through to ``_get_cdna``.
        log_path: file that receives the log output (overwritten).
        config_file: passed to ``config.load_config``.
        resources: resource directory handed to ``convert_to_hg37``.
    """
    logging.basicConfig(
        filename=log_path,
        filemode="w",
        level=logging.INFO,
        format=' %(asctime)s %(filename)-15s %(message)s',
    )

    gene_config = config.load_config(config_file)

    # Per gene symbol: list of synonym accessions (';'-separated in config).
    syn_ac_dict = {
        cfg_row[config.SYMBOL_COL]: cfg_row[config.SYNONYM_AC_COL].split(';')
        for _, cfg_row in gene_config.iterrows()
    }
    # Per gene symbol: default accession used for cDNA HGVS.
    cdna_default_ac_dict = {
        cfg_row[config.SYMBOL_COL]: cfg_row[config.HGVS_CDNA_DEFAULT_AC]
        for _, cfg_row in gene_config.iterrows()
    }

    hgvs_proc = HgvsWrapper()

    def build_variant(row):
        return VCFVariant(row[CHR_COL], row[POS_COL], row[REF_COL],
                          row[ALT_COL])

    def default_accession(gene_symbol):
        return cdna_default_ac_dict[gene_symbol]

    def protein_str(cdna_obj):
        return str(hgvs_proc.to_protein(cdna_obj))

    def row_synonyms(row):
        return get_synonyms(row, hgvs_proc, syn_ac_dict)

    table = pd.read_csv(input, sep='\t')
    table[VAR_OBJ_FIELD] = table.apply(build_variant, axis=1)

    #### CDNA conversions
    table[TMP_CDNA_NORM_FIELD] = _get_cdna(table,
                                           pkl,
                                           hgvs_proc,
                                           cdna_default_ac_dict,
                                           normalize=True)
    table[PYHGVS_CDNA_COL] = table[TMP_CDNA_NORM_FIELD].apply(str)

    has_cdna = table[PYHGVS_CDNA_COL].str.startswith("NM_")

    def cdna_part(position):
        # Element ``position`` of the ':'-split cDNA string, for rows that
        # have a cDNA representation.
        pieces = table.loc[has_cdna, PYHGVS_CDNA_COL].str.split(':')
        return pieces.apply(lambda parts: parts[position])

    table.loc[has_cdna, REFERENCE_SEQUENCE_COL] = cdna_part(0)
    table.loc[has_cdna, HGVS_CDNA_COL] = cdna_part(1)

    # Rows without a cDNA still get a reference sequence for downstream
    # steps; their cDNA column is set to the '-' placeholder.
    table.loc[~has_cdna, REFERENCE_SEQUENCE_COL] = table.loc[
        ~has_cdna, GENE_SYMBOL_COL].apply(default_accession)
    table.loc[~has_cdna, HGVS_CDNA_COL] = '-'

    #### Genomic Coordinates
    table[PYHGVS_GENOMIC_COORDINATE_38_COL] = table[VAR_OBJ_FIELD].apply(str)

    hg37_variants = convert_to_hg37(table[VAR_OBJ_FIELD], resources)
    table[PYHGVS_GENOMIC_COORDINATE_37_COL] = pd.Series(
        [str(variant) for variant in hg37_variants])
    table[PYHGVS_HG37_START_COL] = pd.Series(
        [variant.pos for variant in hg37_variants])

    # The hg37 end is the hg37 start plus the hg38 start-to-end distance.
    hg37_start = table[PYHGVS_HG37_START_COL]
    hg38_span = table[HG38_END_COL] - table[HG38_START_COL]
    table[PYHGVS_HG37_END_COL] = hg37_start + hg38_span

    #### Protein
    table[PYHGVS_PROTEIN_COL] = table[TMP_CDNA_NORM_FIELD].apply(protein_str)

    #### Synonyms
    table[NEW_SYNONYMS_FIELD] = table.apply(row_synonyms, axis=1)
    table[SYNONYMS_COL] = table[SYNONYMS_COL].fillna('').str.strip()
    # Combine the pre-existing synonyms with the generated ones.
    table[SYNONYMS_COL] = table.apply(_merge_synonyms, axis=1)

    #### Writing out
    # Temporary helper columns are not part of the output.
    temporary_columns = [VAR_OBJ_FIELD, NEW_SYNONYMS_FIELD,
                         TMP_CDNA_NORM_FIELD]
    table.drop(columns=temporary_columns).to_csv(output,
                                                 sep='\t',
                                                 index=False)