def get_indel_info(self):
    '''Parse the indel VCF file.

    Indel records have their pos/ref/alt rewritten by
    ``utils.modify_pos_ref_alt`` so that they line up with the position and
    alleles used by the annotation.  The intended result layout, per the
    original notes, is::

        {'chr_pos_ref/alt': [(case), (control)]}

    NOTE(review): the body only extracts the per-record fields; nothing is
    accumulated or returned yet, so the method returns ``None``.
    '''
    with utils.safe_open(self.vcf, 'r') as fr:
        for line in fr:
            if line.startswith('##'):
                # Meta-information lines hold no records.  They must be
                # skipped outright: falling through would use head_index
                # before the column header has been parsed.
                continue
            if line.startswith('#'):
                head = line.rstrip('\n')
                head_index = utils.get_head_index(head)
                continue
            # rstrip('\n') instead of strip(''): an empty chars argument
            # strips nothing and leaves the newline on the last column.
            linelist = line.rstrip('\n').split('\t')
            _chr = linelist[head_index['#chrom']]
            pos = linelist[head_index['pos']]
            ref = linelist[head_index['ref']]
            alt = linelist[head_index['alt']]
            pos, ref, alt = utils.modify_pos_ref_alt(pos, ref, alt)
            # Sample columns are optional; read them only when present.
            if 'cancer' in head_index:
                case = linelist[head_index['cancer']]
            if 'normal' in head_index:
                control = linelist[head_index['normal']]
def get_tran_relation(self):
    '''Build the list of gene/transcript relations.

    Reads the tab-separated file ``self.transript_database``.  The line
    starting with ``#`` is the column header; every other line contributes
    one ``gene=transcript`` entry taken from the ``#gene`` and
    ``transcript`` columns.

    Returns:
        list of str, e.g. ``['gene=trans', 'gene=trans']``.
    '''
    tran_relation = []
    with utils.safe_open(self.transript_database, 'r') as fr:
        for line in fr:
            if line.startswith('#'):
                head_index = utils.get_head_index(line)
                continue
            # rstrip('\n') instead of strip(''): an empty chars argument
            # strips nothing, which left the newline glued to the last field.
            linelist = line.rstrip('\n').split('\t')
            gene = linelist[head_index['#gene']]
            tran = linelist[head_index['transcript']]
            # Explicit keywords rather than **locals(), so the template does
            # not silently depend on whatever names are in scope.
            tran_relation.append('{gene}={tran}'.format(gene=gene, tran=tran))
    return tran_relation
# Print the number of rules, fill relation_to_body from them through
# add_relation_body, then print the size of the resulting mapping.
print(len(rules))
relation_to_body = {}
relation_to_body = add_relation_body(rules, relation_to_body)
print(len(relation_to_body))


# # Length 1 Rules
# Lookup built from the mapped training data for length-1 rules.
# NOTE(review): presumably keyed by relation with (e1, e2) pairs, judging by
# the helper name -- confirm against utils.

# In[126]:


dict_1 = utils.get_r_e1e2_dict(mapped_train_data)


# # Length 2 Rules
# The head index is computed first because the length-2 lookup takes it as
# its second argument.

# In[127]:


index_head = utils.get_head_index(mapped_train_data)
dict_2 = utils.get_r1r2_e1e2_dict(mapped_train_data, index_head)


# ## Length 3 Rules

# In[128]:


## Gives the entity in the path for a given relation and body


# In[129]:


# Two further lookups built from the mapped training data.
# NOTE(review): names suggest (relation, e2) -> e1 and (e1, relation) -> e2;
# verify in utils.
re2_e1 = utils.get_re2_e1_dict(mapped_train_data)
e1r_e2 = utils.get_e1r_e2_dict(mapped_train_data)


# In[130]:
def start(self):
    '''Main entry point of the program.

    When ``self.vcf`` is set, per-variant statistics are first loaded through
    ``get_vcf_info.HandleVcf(self.vcf, self.vcftype).start()``.  The VEP
    result file ``self.vep`` is then read line by line: ``##`` lines are
    skipped, the ``#Uploaded_variation`` line defines the column index, and
    every remaining line is turned into a ``headers.HEAD`` row.  When VCF
    statistics are available the row is merged with them and written to the
    file ``test`` as a line of column names followed by a line of values.
    '''
    if self.vcf:
        vcf_info = get_vcf_info.HandleVcf(self.vcf, self.vcftype).start()
    with open(self.vep, 'r') as fr, open('test', 'w') as fw:
        for line in fr:
            if line.startswith('##'):
                continue
            elif line.startswith('#Uploaded_variation'):
                head = line
                head_index = utils.get_head_index(head)
                continue
            # rstrip('\n') instead of strip(''): an empty chars argument
            # strips nothing, so the last column used to keep its newline
            # and carried it into the output row.
            linelist = line.rstrip('\n').split('\t')
            # Collect the required information
            row = headers.HEAD()
            ## Fields that can be read directly
            upload_variation = linelist[head_index[
                '#Uploaded_variation'.lower()]]  # chr5_1295229_-/A
            location = linelist[
                head_index['location']]  # chr5:1295187-1295188
            transcript = linelist[head_index['feature']]  # NM_198253.3
            function = linelist[
                head_index['consequence']]  # missense_variant
            strand = linelist[head_index['strand']]  # -1
            gene = linelist[head_index['symbol']]  # TERT
            protein = linelist[head_index['ensp']]  # NP_937983.2
            sift = linelist[head_index['sift']]  # tolerated(0.05)
            polyphen = linelist[head_index['polyphen']]
            exon_id = linelist[head_index['exon']]  # 2/19 or -
            chgvs = linelist[head_index['hgvsc']]  # NM_198253.3:c.77C>T
            phgvs = linelist[head_index['hgvsp']]  # NP_937983.2:p.Thr26Met
            tert = linelist[head_index['tert']]  # only set for the TERT promoter region
            clinvar = linelist[head_index['clinvar_clnsig']]
            rs = linelist[head_index['existing_variation']]
            # Fields that need further processing
            hgvsc = utils.simplify_hgvsc(chgvs)
            hgvsp2 = utils.simplify_hgvsp(phgvs)
            hgvsp = utils.get_oneletter_hgvsp(hgvsp2)
            exon_id = utils.get_exon_id(exon_id)
            _chr, start, end = utils.get_chr_start_end_from_location(
                location)
            ref, alt = utils.get_ref_alt_from_upload_variation(
                upload_variation)
            muttype = utils.get_muttype(ref, alt)
            genotype = utils.get_genotype(ref, alt, strand)
            flank = utils.get_flank_according_upload_variation(
                upload_variation, self.hg19)
            bl_muttype = utils.get_bl_muttype()
            # Update the row
            row.gene = gene
            row.chgvs = hgvsc
            row.phgvs = hgvsp
            row.phgvs2 = hgvsp2
            row.exon_id = exon_id
            row.vep_function = function
            row.sift = sift
            row.polyphen2 = polyphen
            row.chr = _chr
            row.start = start
            row.end = end
            row.ref = ref
            row.alt = alt
            row.muttype = muttype
            row.genotype = genotype
            row.transcript = transcript
            row.protein = protein
            row.strand = strand
            row.flank = flank
            row.rs = rs
            row.bl_muttype = bl_muttype
            row.clinvar = clinvar
            if self.vcf:
                # vcf_info is keyed by the Uploaded_variation string.
                freq_tag = vcf_info[upload_variation]
                print(freq_tag)
                info = row.update_head(**freq_tag)
                # NOTE(review): the column names are written before every
                # row, not once per file -- confirm this is intended.
                fw.write('\t'.join(info.keys()) + '\n')
                fw.write('\t'.join(map(str, info.values())) + '\n')
if __name__ == '__main__':
    # Script entry point: reads the tab-separated --infile and writes to
    # --result.
    import argparse
    parser = argparse.ArgumentParser(description='test')
    parser.add_argument('--infile')
    parser.add_argument('--result')
    # Parsed but not read anywhere in this block.
    parser.add_argument('--vep_function_yaml', help='func 配置文件')
    args = vars(parser.parse_args())
    infile = args['infile']
    result = args['result']
    with open(infile, 'r') as fr, open(result, 'w') as fw:
        for line in fr:
            linelist = line.strip('\n').split('\t')
            # Any line starting with '#' is treated as the column header: it
            # (re)builds head_index and is echoed to the output with two
            # extra column names, vep_simple and bgi_func, appended.
            if line.startswith('#'):
                head_index = utils.get_head_index(linelist)
                fw.write('{}\tvep_simple\tbgi_func\n'.format(
                    '\t'.join(linelist)))
                continue
            # Only rows whose feature (transcript) id starts with 'NM' are
            # processed further.
            nm = linelist[head_index['feature']]
            if not nm.startswith('NM'):
                continue
            gene = linelist[head_index['symbol']]
            upload_variation = linelist[head_index[
                '#Uploaded_variation'.lower()]]
            vep_function = linelist[head_index['consequence']]
            chgvs = linelist[head_index['hgvsc']]
            phgvs = linelist[head_index['hgvsp']]
            tert = linelist[head_index['tert']]
            # NOTE(review): the extracted fields are not used and no data row
            # is written in this block -- it looks incomplete; confirm.
def get_snv_info(self):
    '''Parse the SNV VCF file.

    For every record the INFO column is searched with ``self.case_pattern``
    and ``self.control_pattern``.  Each pattern is expected to capture four
    integers: ref reads on the positive strand, ref reads on the negative
    strand, alt reads on the positive strand and alt reads on the negative
    strand.

    Returns:
        ``defaultdict(dict)`` keyed by ``chr_pos_ref/alt`` (the same form as
        the Uploaded_variation field of the VEP annotation result).  Each
        value holds the ``Case_*`` and/or ``Ctrl_*`` read statistics for the
        patterns that matched; records where neither matched get no entry.
    '''
    vcf_info = defaultdict(dict)
    with utils.safe_open(self.vcf, 'r') as fr:
        for line in fr:
            if line.startswith('##'):
                continue
            elif line.startswith('#'):
                head = line.rstrip('\n')
                head_index = utils.get_head_index(head)
                continue
            # rstrip('\n') instead of strip(''): an empty chars argument
            # strips nothing and leaves the newline on the last column.
            linelist = line.rstrip('\n').split('\t')
            _chr = linelist[head_index['#chrom']]
            pos = linelist[head_index['pos']]
            ref = linelist[head_index['ref']]
            alt = linelist[head_index['alt']]
            key = '{_chr}_{pos}_{ref}/{alt}'.format(
                _chr=_chr, pos=pos, ref=ref, alt=alt)
            info = linelist[head_index['info']]
            case = re.search(self.case_pattern, info)
            control = re.search(self.control_pattern, info)
            if case:
                case_ref_positive = int(case.group(1))
                case_ref_negative = int(case.group(2))
                case_alt_positive = int(case.group(3))  # read2 is alt read
                case_alt_negative = int(case.group(4))
                case_ref = case_ref_positive + case_ref_negative
                case_alt = case_alt_positive + case_alt_negative
                case_pos_depth = case_ref + case_alt
                # A site with no supporting reads would divide by zero;
                # report a frequency of 0.0 for it instead of crashing.
                if case_pos_depth:
                    case_var_freq = (case_alt / case_pos_depth) * 100
                else:
                    case_var_freq = 0.0
                vcf_info[key].update({
                    'Case_ref_readsNum': case_ref,
                    'Case_var_readsNum': case_alt,
                    'Case_var_Positive_readsNum': case_alt_positive,
                    'Case_var_Negative_readsNum': case_alt_negative,
                    'Case_pos_dep': case_pos_depth,
                    'Case_var_freq': case_var_freq
                })
            if control:
                control_ref_positive = int(control.group(1))
                control_ref_negative = int(control.group(2))
                control_alt_positive = int(control.group(3))
                control_alt_negative = int(control.group(4))
                control_ref = control_ref_positive + control_ref_negative
                control_alt = control_alt_positive + control_alt_negative
                control_pos_depth = control_ref + control_alt
                vcf_info[key].update({
                    'Ctrl_ref_readsNum': control_ref,
                    'Ctrl_var_readsNum': control_alt,
                    'Ctrl_var_Positive_readsNum': control_alt_positive,
                    'Ctrl_var_Negative_readsNum': control_alt_negative,
                    'Ctrl_pos_dep': control_pos_depth
                })
    return vcf_info