/
compare_mafs_tcgaInGdc_v3.py
150 lines (118 loc) · 5.46 KB
/
compare_mafs_tcgaInGdc_v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/python
from pyliftover import LiftOver
import sys
import re
import glob
import pybedtools
from pybedtools import BedTool
# Compare SNP calls of a TCGA (hg19) MAF file against the GDC (hg38) MAF files
# of one project and report, per tumor/normal sample pair, how many TCGA
# variants are also present in each GDC file (recall).

USAGE = """\
There are some missing arguments.
Usage: compare_mafs GDC_PROJECT MAF_FILE_TCGA
GDC_PROJECT: Project code used to locate the GDC maf files
MAF_FILE_TCGA: Path for TCGA maf file
"""

# The GDC files are searched as GDC_GLOB_PREFIX + project + GDC_GLOB_SUFFIX.
GDC_GLOB_PREFIX = '../kossproject/*_maf_files_tcga/TCGA.'
GDC_GLOB_SUFFIX = '*.maf'
FASTA_REF_PATH = '/mnt/GDCpaper/Homo_sapiens.GRCh38.dna.primary_assembly.fa'

# Header labels used when exactly four GDC files are found.
# NOTE(review): this assumes the four sorted file paths correspond to these
# callers in this order -- confirm against the actual file names.
DEFAULT_CALLER_LABELS = ["MuSE", "MuTect2", "SomaticSniper", "VarScan2"]

# 0-based indexes of the tab-separated MAF columns read by this script.
# NOTE(review): the names follow the usual MAF column layout; the script only
# relies on the positions -- confirm against the header of the input files.
COL_HUGO = 0
COL_CHROM = 4
COL_START = 5
COL_END = 6
COL_STRAND = 7
COL_VARIANT_TYPE = 9
COL_REF_ALLELE = 10
COL_TUMOR_SAMPLE = 15
COL_NORMAL_SAMPLE = 16
COL_GDC_FILTER = 108

# Filter values of the GDC rows that are kept.
GDC_ACCEPTED_FILTERS = ("PASS", "common_variant")


def is_data_row(columns, last_index):
    """Return True for a variant row that has every column up to last_index.

    Header rows (first column "Hugo_Symbol"), empty lines and rows too short
    to be indexed safely are rejected instead of raising IndexError.
    """
    return len(columns) > last_index and columns[COL_HUGO] != "Hugo_Symbol"


def read_gdc_variant_keys(maf_path):
    """Collect the accepted SNP keys of one GDC MAF file.

    Returns a tuple (variant_keys, sample_pairs, n_rows):
      variant_keys -- set of "chrom start end strand tumor normal" strings
      sample_pairs -- set of "tumor normal" strings seen in accepted rows
      n_rows       -- number of accepted rows (duplicates included)
    """
    variant_keys = set()
    sample_pairs = set()
    n_rows = 0
    with open(maf_path) as handle:
        for line in handle:
            columns = line.split('\t')
            if not is_data_row(columns, COL_GDC_FILTER):
                continue
            if columns[COL_VARIANT_TYPE] != "SNP":
                continue
            if columns[COL_GDC_FILTER] not in GDC_ACCEPTED_FILTERS:
                continue
            samples_pair = ' '.join([columns[COL_TUMOR_SAMPLE],
                                     columns[COL_NORMAL_SAMPLE]])
            variant_keys.add(' '.join([columns[COL_CHROM], columns[COL_START],
                                       columns[COL_END], columns[COL_STRAND],
                                       samples_pair]))
            sample_pairs.add(samples_pair)
            n_rows += 1
    return variant_keys, sample_pairs, n_rows


def lift_one_based(lo, chrom, position):
    """Lift a 1-based position with `lo`; return (chrom, position, strand).

    pyliftover's convert_coordinate() is documented to work on 0-based
    coordinates, while MAF positions are 1-based, so the position is shifted
    before and after the conversion. Returns None when the position does not
    map to exactly one target location.
    """
    hits = lo.convert_coordinate(chrom, position - 1)
    if hits is None or len(hits) != 1:
        return None
    return hits[0][0], hits[0][1] + 1, hits[0][2]


def count_tcga_matches(tcga_maf_file, gdc_key_sets, gdc_pairs, lo, fasta_ref):
    """Count, per sample pair, the TCGA SNPs found in each GDC key set.

    Only SNP rows whose sample pair is in gdc_pairs are considered. A variant
    is counted when it lifts over uniquely and its reference allele equals the
    sequence read from fasta_ref at the lifted location.

    Returns (pair_counts, total_variants, noncross, diffref) where
    pair_counts[pair] is a list with one match counter per GDC key set
    followed by the number of counted variants of that pair.
    """
    nfiles = len(gdc_key_sets)
    pair_counts = {}
    total_variants = 0
    noncross = 0
    diffref = 0
    with open(tcga_maf_file) as handle:
        for line in handle:
            columns = line.split('\t')
            if not is_data_row(columns, COL_NORMAL_SAMPLE):
                continue
            pair_key = columns[COL_TUMOR_SAMPLE] + ' ' + columns[COL_NORMAL_SAMPLE]
            if columns[COL_VARIANT_TYPE] != "SNP" or pair_key not in gdc_pairs:
                continue
            total_variants += 1
            chrom = 'chr' + columns[COL_CHROM]
            start = lift_one_based(lo, chrom, int(columns[COL_START]))
            end = lift_one_based(lo, chrom, int(columns[COL_END]))
            if start is None or end is None:
                noncross += 1
                continue
            # The 'chr' prefix is stripped before querying the FASTA file.
            region = start[0].replace('chr', '') + ':' + str(start[1]) + '-' + str(end[1])
            refbase = BedTool.seq(region, fasta_ref)
            if refbase != columns[COL_REF_ALLELE]:
                diffref += 1
                continue
            variant_key = ' '.join([start[0], str(start[1]), str(end[1]), start[2],
                                    columns[COL_TUMOR_SAMPLE],
                                    columns[COL_NORMAL_SAMPLE]])
            counts = pair_counts.setdefault(pair_key, [0] * (nfiles + 1))
            # The last slot is the per-pair total, whatever the number of files.
            counts[nfiles] += 1
            for i, keys in enumerate(gdc_key_sets):
                if variant_key in keys:
                    counts[i] += 1
    return pair_counts, total_variants, noncross, diffref


def summarize_counts(pair_counts, nfiles):
    """Sum the per-pair counters; return (matches_per_file, total_variants)."""
    matches = [0] * nfiles
    total = 0
    for counts in pair_counts.values():
        total += counts[nfiles]
        for i in range(nfiles):
            matches[i] += counts[i]
    return matches, total


def recall_fractions(matches, total):
    """Return matches/total for each file, or nan for each when total is 0."""
    if total == 0:
        return [float('nan')] * len(matches)
    return [float(m) / float(total) for m in matches]


def column_labels(gdc_maf_files):
    """Return one header label per GDC file.

    The fixed caller names are used when exactly four files are given;
    otherwise each column is labelled with its file path.
    """
    if len(gdc_maf_files) == len(DEFAULT_CALLER_LABELS):
        return list(DEFAULT_CALLER_LABELS)
    return list(gdc_maf_files)


def write_recall_table(recall_file, pair_counts, labels):
    """Write one tab-separated row of counters per sample pair to recall_file."""
    with open(recall_file, 'w') as out:
        out.write("Samples\t" + '\t'.join(labels) + "\tTotal\n")
        for key in pair_counts:
            out.write('\t'.join([key] + [str(c) for c in pair_counts[key]]) + '\n')


def main(argv=None):
    """Run the comparison; argv defaults to sys.argv.

    argv[1] is the project code used to glob the GDC MAF files and argv[2] is
    the path of the TCGA MAF file. Writes "<project>_recall.txt" and prints a
    summary. Exits with status 1 when arguments are missing or no GDC file
    matches.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        print(USAGE)
        sys.exit(1)
    gdc_maf_project = argv[1]
    tcga_maf_file = argv[2]

    # glob returns paths in arbitrary order; sort them so that the columns of
    # the report are always in the same order.
    gdc_maf_files = sorted(glob.glob(GDC_GLOB_PREFIX + gdc_maf_project + GDC_GLOB_SUFFIX))
    nfiles_gdc = len(gdc_maf_files)
    if nfiles_gdc == 0:
        print("No GDC maf files found for " + gdc_maf_project)
        sys.exit(1)

    # Coordinate converter from hg19 to hg38.
    lo = LiftOver('hg19', 'hg38')
    # NOTE(review): example_filename() is given an absolute path here;
    # presumably it resolves to that same path -- confirm.
    fasta_ref = pybedtools.example_filename(FASTA_REF_PATH)

    gdc_key_sets = []
    gdc_pairs = set()
    for maf_file in gdc_maf_files:
        print("Retrieving variant keys from " + maf_file + " in GDC ...")
        variant_keys, sample_pairs, n_rows = read_gdc_variant_keys(maf_file)
        print("{0} GDC variants considered in {1}".format(n_rows, maf_file))
        gdc_key_sets.append(variant_keys)
        gdc_pairs.update(sample_pairs)

    print("Checking variants keys in TCGA...")
    pair_counts, total_variants, noncross, diffref = count_tcga_matches(
        tcga_maf_file, gdc_key_sets, gdc_pairs, lo, fasta_ref)

    recall_file = gdc_maf_project + '_recall' + '.txt'
    write_recall_table(recall_file, pair_counts, column_labels(gdc_maf_files))

    matches, total = summarize_counts(pair_counts, nfiles_gdc)
    print("TOTAL: " + '\t'.join(str(r) for r in recall_fractions(matches, total)))
    print("Cases = {0}".format(len(pair_counts)))
    print("Non Crossed variants = {0}/{1}".format(noncross, total_variants))
    print("Ref seq not matching variants = {0}/{1}".format(diffref, total_variants))


if __name__ == "__main__":
    main()