if re.search(r"Not required", error) is None and error != 'chr0:0 .. REF allele listed in the ALT field??' and error != '':
    raise Exception('ERROR: vcf [' + vcf_path + '] not valid [' + error + ']')
'''
# NOTE(review): this fragment contains two triple-quote delimiters. Whether the
# statements between them or the statements outside them are the disabled
# (string-literal) part depends on whether a delimiter is already open above
# this point - confirm against the full file before relying on either part.

# The VCF must match the platform definition - no extra/missing records.
# CHROM,POS,ID,REF,ALT must match.
# Same sort order - may be able to relax this requirement later.
# Samples must also match between the summary and the VCF.
err1 = "ERROR: VCF file [{0}] does not match PLATFORM file [{1}] on {2} at variant record {3}. VCF={4}; PLATFORM={5}"
err2 = "ERROR: VCF file [{0}] does not match PLATFORM file [{1}]. Different line counts."
fields = ["CHROM","POS","ID","REF","ALT"]
with gzip.open(vcf_path, 'rb') as vcf, gzip.open(summary_path, 'rb') as summary:
    # Remove header comments from each file and capture the header row.
    # Presumably read_through_headers consumes lines starting with the given
    # prefix and returns the first line after them - verify in bankfunctions.
    # NOTE(review): both files are opened in binary mode; on Python 3 the
    # header rows are bytes unless read_through_headers decodes them - confirm.
    vcfhead = bankfunctions.read_through_headers(vcf, '##')
    #platformhead = bankfunctions.read_through_headers(platform,'##')
    summaryhead = bankfunctions.read_through_headers(summary, '#')

    # Check samples match between vcf and summary to ensure the correct summary
    # file was selected (the posterior file is assumed to match by virtue of
    # being in the same directory).
    vcf_samples = vcfhead.strip().split('\t')
    # Raw string: "\.cel" is an invalid escape sequence in a normal literal.
    pattern = re.compile(r"\.cel", re.IGNORECASE)
    summary_samples = summaryhead.strip().split('\t')
    # Column 0 of the summary header is skipped; every case-insensitive ".cel"
    # occurrence is removed from the remaining column names.
    summary_samples_no_cel = [pattern.sub("", sample) for sample in summary_samples[1:]]
    # The first nine VCF header columns are skipped before comparing sample names.
    if vcf_samples[9:] != summary_samples_no_cel:
        raise Exception('ERROR: vcf [' + vcf_path + '] samples do not match samples in summary [' + summary_path + ']')
    sample_count = len(summary_samples_no_cel)
'''
# PENDING: Commenting out due to issues with sorted vs not sorted files. Can safely assume correctly formed at the moment, as all input through GXBANK_CONVERT script
# Check variants match between vcf and platform.
var_count = 0
## filter_num = lambda key,value,criteria: key if value != 'NA' and value != '' and float(value) < criteria else None
## # FILTER Thresholds
## thresholds_homRO = {'0':0.6, '1':0.6, '2':0.3, '3':-0.9}

# Open the gzip-compressed inputs (calls, confidences, performance, platform
# VCF) for reading and the unsorted output VCF for writing.
with gzip.open(callFile, 'rb') as calls, \
     gzip.open(confFile, 'rb') as confs, \
     gzip.open(perfFile, 'rb') as perfs, \
     gzip.open(platformVCF_path, "rb") as plat_vcf, \
     gzip.open(vcfFileUnsorted, "wb") as vcf:

    # Move past the header section of the performance file and the platform
    # VCF; the value returned by the helper is not needed for these two.
    bankfunctions.read_through_headers(perfs, '#')
    bankfunctions.read_through_headers(plat_vcf, '##')

    # Tab-separated column headings of the calls and confidences files.
    # Presumably read_through_headers returns the first line after the
    # '#'-prefixed lines - verify in bankfunctions.
    call = bankfunctions.read_through_headers(calls, '#').strip().split('\t')
    conf = bankfunctions.read_through_headers(confs, '#').strip().split('\t')

    # Both files must list the same number of columns. Column 0 is not counted
    # as a sample, hence the "- 1" in the reported counts.
    # (A comparison against a third header list, `sum`, is currently disabled.)
    call_columns = len(call)
    conf_columns = len(conf)
    if call_columns != conf_columns:
        raise Exception(err4.format(axiomdir, 'AxiomGT1.calls.txt.gz', call_columns - 1, 'AxiomGT1.confidences.txt.gz', conf_columns - 1))

    # The sample columns must also appear in the same order in both files;
    # i is the 1-based column position reported on the first mismatch.
    for i, (call_name, conf_name) in enumerate(zip(call[1:], conf[1:]), 1):
        if call_name != conf_name:
            raise Exception(err5.format(axiomdir, i, 'AxiomGT1.calls.txt.gz', call_name, 'AxiomGT1.confidences.txt.gz', conf_name))

    # Log number of samples.
    logging.info('[%d] samples found in these Axiom files', call_columns - 1)