Exemple #1
0
    def get_indel_info(self):
        '''Process an indel VCF file.

        For indel records the pos, ref and alt are rewritten with
        ``utils.modify_pos_ref_alt`` so that they correspond to the pos, ref
        and alt used by the annotation. The intended result shape is::

            {
                'chr_pos_ref/alt': [(case), (control)]
            }

        NOTE(review): the visible body only extracts the per-record fields;
        it neither builds nor returns the mapping described above.
        '''
        with utils.safe_open(self.vcf, 'r') as fr:
            for line in fr:
                if line.startswith('##'):
                    # Meta-information lines hold no records. Falling through
                    # here would index head_index before the '#' header line
                    # has defined it.
                    continue
                elif line.startswith('#'):
                    # strip('') removes nothing; drop the newline explicitly.
                    head = line.strip('\n')
                    head_index = utils.get_head_index(head)
                    continue

                # Remove the trailing newline so the last column stays clean.
                linelist = line.strip('\n').split('\t')
                _chr = linelist[head_index['#chrom']]
                pos = linelist[head_index['pos']]
                ref = linelist[head_index['ref']]
                alt = linelist[head_index['alt']]

                pos, ref, alt = utils.modify_pos_ref_alt(pos, ref, alt)
                # The sample columns are optional; read them only when the
                # header declares them.
                if 'cancer' in head_index:
                    case = linelist[head_index['cancer']]
                if 'normal' in head_index:
                    control = linelist[head_index['normal']]
    def get_tran_relation(self):
        '''Collect the gene/transcript pairs listed in the transcript database.

        input:
            self.transript_database: tab-separated file opened with
                utils.safe_open; its header line starts with '#' and
                provides the '#gene' and 'transcript' columns
        output:
            list: ['gene=trans', 'gene=trans', ...], one entry per data line
        '''
        tran_relation = []
        with utils.safe_open(self.transript_database, 'r') as fr:
            for line in fr:
                if line.startswith('#'):
                    head_index = utils.get_head_index(line)
                    continue
                # strip('') removes nothing; drop the newline explicitly so a
                # transcript in the last column does not end with '\n'.
                linelist = line.strip('\n').split('\t')
                gene = linelist[head_index['#gene']]
                tran = linelist[head_index['transcript']]
                tran_relation.append('{}={}'.format(gene, tran))

        return tran_relation
Exemple #3
0
# Report how many rules are loaded, then fill relation_to_body from them via
# add_relation_body and report how many entries it ended up with.
# NOTE(review): presumably maps each relation to its rule bodies -- confirm
# against add_relation_body.
print(len(rules))
relation_to_body = {}
relation_to_body = add_relation_body(rules, relation_to_body)
print(len(relation_to_body))

# # Length 1 Rules

# In[126]:

# Lookup derived from the mapped training data for length-1 rules.
# NOTE(review): key/value layout is defined by utils.get_r_e1e2_dict; verify
# there before relying on it.
dict_1 = utils.get_r_e1e2_dict(mapped_train_data)

# # Length 2 Rules

# In[127]:

# index_head is built first because get_r1r2_e1e2_dict takes it as input.
index_head = utils.get_head_index(mapped_train_data)
dict_2 = utils.get_r1r2_e1e2_dict(mapped_train_data, index_head)

# ## Length 3 Rules

# In[128]:

## Gives the entities on a path for a given relation and body

# In[129]:

# Two further lookups over the same training data.
# NOTE(review): the names suggest (relation, e2) -> e1 and (e1, relation) -> e2;
# confirm against utils.
re2_e1 = utils.get_re2_e1_dict(mapped_train_data)
e1r_e2 = utils.get_e1r_e2_dict(mapped_train_data)

# In[130]:
    def start(self):
        '''Main entry point of the program.

        Reads the VEP annotation file ``self.vep`` line by line, fills one
        ``headers.HEAD`` row per annotation line and writes the row returned
        by ``row.update_head`` to the file ``test``. When ``self.vcf`` is set,
        the per-variant values looked up from ``get_vcf_info.HandleVcf`` are
        passed to ``row.update_head`` as keyword arguments; otherwise it is
        called without any.
        '''
        if self.vcf:
            vcf_info = get_vcf_info.HandleVcf(self.vcf, self.vcftype).start()

        with open(self.vep, 'r') as fr, open('test', 'w') as fw:
            for line in fr:
                if line.startswith('##'):
                    continue
                elif line.startswith('#Uploaded_variation'):
                    head = line
                    head_index = utils.get_head_index(head)
                    continue

                # strip('') removes nothing; drop the newline explicitly so
                # the last column does not carry it into the output.
                linelist = line.strip('\n').split('\t')
                # Collect the required information
                row = headers.HEAD()
                ## Fields that can be read directly
                upload_variation = linelist[head_index[
                    '#Uploaded_variation'.lower()]]  # chr5_1295229_-/A
                location = linelist[
                    head_index['location']]  # chr5:1295187-1295188
                transcript = linelist[head_index['feature']]  # NM_198253.3
                function = linelist[
                    head_index['consequence']]  # missense_variant
                strand = linelist[head_index['strand']]  # -1
                gene = linelist[head_index['symbol']]  # TERT
                protein = linelist[head_index['ensp']]  # NP_937983.2
                sift = linelist[head_index['sift']]  # tolerated(0.05)
                polyphen = linelist[head_index['polyphen']]
                exon_id = linelist[head_index['exon']]  # 2/19 or -
                chgvs = linelist[head_index['hgvsc']]  # NM_198253.3:c.77C>T
                phgvs = linelist[head_index['hgvsp']]  # NP_937983.2:p.Thr26Met
                # Only present for the TERT promoter region.
                # NOTE(review): read but not used in this method.
                tert = linelist[head_index['tert']]
                clinvar = linelist[head_index['clinvar_clnsig']]
                rs = linelist[head_index['existing_variation']]

                # Fields that need processing before use
                hgvsc = utils.simplify_hgvsc(chgvs)
                hgvsp2 = utils.simplify_hgvsp(phgvs)
                hgvsp = utils.get_oneletter_hgvsp(hgvsp2)
                exon_id = utils.get_exon_id(exon_id)
                _chr, start, end = utils.get_chr_start_end_from_location(
                    location)
                ref, alt = utils.get_ref_alt_from_upload_variation(
                    upload_variation)
                muttype = utils.get_muttype(ref, alt)
                genotype = utils.get_genotype(ref, alt, strand)
                flank = utils.get_flank_according_upload_variation(
                    upload_variation, self.hg19)
                bl_muttype = utils.get_bl_muttype()

                # Update row
                row.gene = gene
                row.chgvs = hgvsc
                row.phgvs = hgvsp
                row.phgvs2 = hgvsp2
                row.exon_id = exon_id
                row.vep_function = function
                row.sift = sift
                row.polyphen2 = polyphen
                row.chr = _chr
                row.start = start
                row.end = end
                row.ref = ref
                row.alt = alt
                row.muttype = muttype
                row.genotype = genotype
                row.transcript = transcript
                row.protein = protein
                row.strand = strand
                row.flank = flank
                row.rs = rs
                row.bl_muttype = bl_muttype
                row.clinvar = clinvar

                # Without a VCF there is nothing extra to merge. Start from an
                # empty mapping so update_head is still callable instead of
                # hitting an unbound freq_tag.
                freq_tag = {}
                if self.vcf:
                    freq_tag = vcf_info[upload_variation]
                    print(freq_tag)
                info = row.update_head(**freq_tag)
                # NOTE(review): the key line is written before every data
                # line, not once per file -- confirm this is intended.
                fw.write('\t'.join(info.keys()) + '\n')
                fw.write('\t'.join(map(str, info.values())) + '\n')
if __name__ == '__main__':
    # Command-line driver: copy the header of a tab-separated annotation file
    # to the result file with two extra columns, then read the fields of every
    # record whose feature is an NM transcript.
    import argparse
    parser = argparse.ArgumentParser(description='test')
    # Both paths are opened unconditionally below, so they are mandatory:
    # argparse then reports a usage error instead of open(None) failing.
    parser.add_argument('--infile', required=True)
    parser.add_argument('--result', required=True)
    parser.add_argument('--vep_function_yaml', help='func 配置文件')
    args = vars(parser.parse_args())

    infile = args['infile']
    result = args['result']

    with open(infile, 'r') as fr, open(result, 'w') as fw:
        for line in fr:
            linelist = line.strip('\n').split('\t')
            if line.startswith('#'):
                # Header line: index the columns and emit the extended header.
                head_index = utils.get_head_index(linelist)
                fw.write('{}\tvep_simple\tbgi_func\n'.format(
                    '\t'.join(linelist)))
                continue

            # Only records whose feature starts with 'NM' are processed.
            nm = linelist[head_index['feature']]
            if not nm.startswith('NM'):
                continue

            # NOTE(review): the fields below are extracted but not used
            # within this block.
            gene = linelist[head_index['symbol']]
            upload_variation = linelist[head_index[
                '#Uploaded_variation'.lower()]]
            vep_function = linelist[head_index['consequence']]
            chgvs = linelist[head_index['hgvsc']]
            phgvs = linelist[head_index['hgvsp']]
            tert = linelist[head_index['tert']]
Exemple #6
0
    def get_snv_info(self):
        '''Process an SNV VCF file.

        Reads the INFO column of every record and, where
        ``self.case_pattern`` / ``self.control_pattern`` match, stores the
        derived read counts under the variant key::

            vcf_info = {
                'chr_pos_ref/alt': {'Case_ref_readsNum': ..., ...,
                                    'Ctrl_ref_readsNum': ..., ...}
            }

        The key has the same form as the upload_variation field of the VEP
        annotation output. Groups 1-4 of each pattern are read as the
        ref-positive, ref-negative, alt-positive and alt-negative read counts.

        Returns:
            defaultdict(dict) keyed by 'chr_pos_ref/alt'. A record where
            neither pattern matches adds no key.
        '''
        vcf_info = defaultdict(dict)
        with utils.safe_open(self.vcf, 'r') as fr:
            for line in fr:
                if line.startswith('##'):
                    continue
                elif line.startswith('#'):
                    # strip('') removes nothing; drop the newline explicitly.
                    head = line.strip('\n')
                    head_index = utils.get_head_index(head)
                    continue

                linelist = line.strip('\n').split('\t')
                _chr = linelist[head_index['#chrom']]
                pos = linelist[head_index['pos']]
                ref = linelist[head_index['ref']]
                alt = linelist[head_index['alt']]
                key = '{}_{}_{}/{}'.format(_chr, pos, ref, alt)

                info = linelist[head_index['info']]
                case = re.search(self.case_pattern, info)
                control = re.search(self.control_pattern, info)
                if case:
                    case_ref_positive = int(case.group(1))
                    case_ref_negative = int(case.group(2))
                    case_alt_positive = int(case.group(3))  # read2 is alt read
                    case_alt_negative = int(case.group(4))
                    case_ref = case_ref_positive + case_ref_negative
                    case_alt = case_alt_positive + case_alt_negative
                    case_pos_depth = case_ref + case_alt
                    # A site with no reads would otherwise raise
                    # ZeroDivisionError; report a frequency of 0 instead.
                    if case_pos_depth:
                        case_var_freq = (case_alt / case_pos_depth) * 100
                    else:
                        case_var_freq = 0.0
                    vcf_info[key].update({
                        'Case_ref_readsNum': case_ref,
                        'Case_var_readsNum': case_alt,
                        'Case_var_Positive_readsNum': case_alt_positive,
                        'Case_var_Negative_readsNum': case_alt_negative,
                        'Case_pos_dep': case_pos_depth,
                        'Case_var_freq': case_var_freq
                    })
                if control:
                    control_ref_positive = int(control.group(1))
                    control_ref_negative = int(control.group(2))
                    control_alt_positive = int(control.group(3))
                    control_alt_negative = int(control.group(4))
                    control_ref = control_ref_positive + control_ref_negative
                    control_alt = control_alt_positive + control_alt_negative
                    control_pos_depth = control_ref + control_alt
                    vcf_info[key].update({
                        'Ctrl_ref_readsNum': control_ref,
                        'Ctrl_var_readsNum': control_alt,
                        'Ctrl_var_Positive_readsNum': control_alt_positive,
                        'Ctrl_var_Negative_readsNum': control_alt_negative,
                        'Ctrl_pos_dep': control_pos_depth
                    })

        return vcf_info