def main(): global prev_lines #parse ARGs parser = argparse.ArgumentParser(description='Filter SNP positions by call quality and min. coverage. Awaits filenames for M, P .vcf files, and M, P .sam files.') parser.add_argument('filenames', type=str, nargs='+', help='paths to .vcf files with M, P SNPs and to corresponding .sam files') args = parser.parse_args() if len(args.filenames) != 4: exit("Unexpected number of arguments passed! Expecting 4 filenames.") #treat these as CONSTANTS! M = 0; P = 1; MR = 2; PR = 3; ALL_VCF = [M, P] ALL = [M, P, MR, PR] ''' Get union of the M and P SNP positions ''' print " Stage 1" #list of input files in_files = [open(args.filenames[i], "r" ) for i in ALL] #dictionary to store the union of SNP positions data = dict() loci = dict() #read the first SNP from M, P vcf files snps = [[] for i in ALL_VCF] for i in ALL_VCF: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #get list of all positions in UNION of M and P SNP positions while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in ALL_VCF: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in ALL_VCF: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P]] for i in [M, P]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3][0], snps[i][4][0]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #check for homozygous alternative sites for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out genotype config info gt = info[9].split(':')[0] #if homozygous alternative if gt[0] == '1': alleles[i][0] = snps[i][4][0] if gt[2] == '1': alleles[i][1] = snps[i][4][0] if gt[0] == '0': alleles[i][0] = snps[i][3][0] if gt[2] == '0': alleles[i][1] = snps[i][3][0] #take note that for this position we need to get allele counts in plasma samaples loci[min_pos] = alleles sp.add_pos(min_pos, data) #read input: next SNP for i in ALL_VCF: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #END WHILE ''' Get coverage information of the SNP positions from corresponding .sam files ''' print " Stage 2" #fetch allele support for the UNION positions in maternal and paternal reads #set up datastructures for counting allele support in diffrenct SAM files posInfo = [dict() for i in ALL] for R in [MR, PR]: posInfo[R] = copy.deepcopy(data) #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data' for R in [MR, PR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) ''' Filter the SNP positions according to call quality and coverage ''' print " Stage 3" #reopen VCF files for f in in_files: f.close() in_files = [open(args.filenames[i], "r" ) for i in ALL_VCF] #list of output files out_files = [open(args.filenames[i][:-3]+"ftr.vcf", "w") for i in ALL_VCF] #read the first SNP from M, P vcf files snps = [[] for i in ALL_VCF] for i in ALL_VCF: snps[i] = getlineFromFile(in_files, out_files, i) #positions ignored from the union of M and P ignored_pos = 0 while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in ALL_VCF: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in ALL_VCF: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get genotype call quality callQ = [0. for i in ALL_VCF] for i in ALL_VCF: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out quality info callQ[i] = float(info[5]) qualityOK = bool(callQ[M] >= 75 or callQ[P] >= 75) #get coverage info alleles = loci[min_pos] coverage = [0 for i in [M, P]] count_sum = [0 for i in [M, P]] for i in [M, P]: a1 = alleles[i][0] a2 = alleles[i][1] count_a1 = 0 count_a2 = 0 try: count_a1 = posInfo[i+2][min_pos][a1] except: print i, min_pos, a1, posInfo[i+2][min_pos], alleles[i] try: count_a2 = posInfo[i+2][min_pos][a2] except: print i, min_pos, a2, posInfo[i+2][min_pos], alleles[i] count_sum[i] = sum(posInfo[i+2][min_pos].values()) coverage[i] = count_a1 + count_a2 #posInfo[i+2][min_pos][a1] + posInfo[i+2][min_pos][a2] """ #using mpileup to get the coverage info cmd = 'samtools mpileup -r %(chr)s:%(pos)d-%(pos)d __%(gnm)s.part.bam' % {'chr':min_chr, 'pos':min_pos, 'gnm':'MP'[i]} process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() if process.returncode == 0: fields = out.split('\t') if len(fields) < 4: #there is no coverage in the BAM file for this position coverage[i] = 0 #print "!!! :", out, "|", err, "|", cmd else: coverage[i] = int(fields[3]) #print out, '>>coverage>>', coverage[i] else: print err """ coverageOK = bool(coverage[M] >= 15 and coverage[P] >= 15) contaminationOK = False try: contaminationOK = bool(float(coverage[M])/count_sum[M] >= 0.9 and float(coverage[P])/count_sum[P] >= 0.9) except: contaminationOK = False if coverageOK and not contaminationOK: print min_pos, "- contamination M:", coverage[M], posInfo[M+2][min_pos], alleles[M], "P:", coverage[P], posInfo[P+2][min_pos], alleles[P] #MOK = bool(callQ[M] >= 75 or coverage[M] >= 15) #POK = bool(callQ[P] >= 75 or coverage[P] >= 15) #if not (MOK and POK): #ignore positions that are not good enough if not (qualityOK and coverageOK and contaminationOK): #ignore positions that are not good enough ignored_pos += 1 for i in ALL_VCF: #if there is a SNP in the data at this position, skip it if min_chr == snps[i][0] and min_pos == snps[i][1]: #print min_pos, callQ[M], callQ[P], coverage[M], coverage[P], prev_lines[i], prev_lines[i] = '' #read input: next SNP for i in ALL_VCF: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = getlineFromFile(in_files, out_files, i) #END WHILE print "Low quality positions ignored in the region:", ignored_pos
def main(): global prev_lines #parse ARGs parser = argparse.ArgumentParser( description= 'Filter SNP positions by call quality and min. coverage. Awaits filenames for M, P .vcf files, and M, P .sam files.' ) parser.add_argument( 'filenames', type=str, nargs='+', help= 'paths to .vcf files with M, P SNPs and to corresponding .sam files') args = parser.parse_args() if len(args.filenames) != 4: exit("Unexpected number of arguments passed! Expecting 4 filenames.") #treat these as CONSTANTS! M = 0 P = 1 MR = 2 PR = 3 ALL_VCF = [M, P] ALL = [M, P, MR, PR] ''' Get union of the M and P SNP positions ''' print " Stage 1" #list of input files in_files = [open(args.filenames[i], "r") for i in ALL] #dictionary to store the union of SNP positions data = dict() loci = dict() #read the first SNP from M, P vcf files snps = [[] for i in ALL_VCF] for i in ALL_VCF: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #get list of all positions in UNION of M and P SNP positions while len(snps[M]) > 2 or len( snps[P]) > 2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in ALL_VCF: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in ALL_VCF: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P]] for i in [M, P]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3][0], snps[i][4][0]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #check for homozygous alternative sites for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out genotype config info gt = info[9].split(':')[0] #if homozygous alternative if gt[0] == '1': alleles[i][0] = snps[i][4][0] if gt[2] == '1': alleles[i][1] = snps[i][4][0] if gt[0] == '0': alleles[i][0] = snps[i][3][0] if gt[2] == '0': alleles[i][1] = snps[i][3][0] #take note that for this position we need to get allele counts in plasma samaples loci[min_pos] = alleles sp.add_pos(min_pos, data) #read input: next SNP for i in ALL_VCF: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #END WHILE ''' Get coverage information of the SNP positions from corresponding .sam files ''' print " Stage 2" #fetch allele support for the UNION positions in maternal and paternal reads #set up datastructures for counting allele support in diffrenct SAM files posInfo = [dict() for i in ALL] for R in [MR, PR]: posInfo[R] = copy.deepcopy(data) #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data' for R in [MR, PR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) ''' Filter the SNP positions according to call quality and coverage ''' print " Stage 3" #reopen VCF files for f in in_files: f.close() in_files = [open(args.filenames[i], "r") for i in ALL_VCF] #list of output files out_files = [ open(args.filenames[i][:-3] + "ftr.vcf", "w") for i in ALL_VCF ] #read the first SNP from M, P vcf files snps = [[] for i in ALL_VCF] for i in ALL_VCF: snps[i] = getlineFromFile(in_files, out_files, i) #positions ignored from the union of M and P ignored_pos = 0 while len(snps[M]) > 2 or len( snps[P]) > 2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in ALL_VCF: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in ALL_VCF: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get genotype call quality callQ = [0. for i in ALL_VCF] for i in ALL_VCF: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out quality info callQ[i] = float(info[5]) qualityOK = bool(callQ[M] >= 75 or callQ[P] >= 75) #get coverage info alleles = loci[min_pos] coverage = [0 for i in [M, P]] count_sum = [0 for i in [M, P]] for i in [M, P]: a1 = alleles[i][0] a2 = alleles[i][1] count_a1 = 0 count_a2 = 0 try: count_a1 = posInfo[i + 2][min_pos][a1] except: print i, min_pos, a1, posInfo[i + 2][min_pos], alleles[i] try: count_a2 = posInfo[i + 2][min_pos][a2] except: print i, min_pos, a2, posInfo[i + 2][min_pos], alleles[i] count_sum[i] = sum(posInfo[i + 2][min_pos].values()) coverage[ i] = count_a1 + count_a2 #posInfo[i+2][min_pos][a1] + posInfo[i+2][min_pos][a2] """ #using mpileup to get the coverage info cmd = 'samtools mpileup -r %(chr)s:%(pos)d-%(pos)d __%(gnm)s.part.bam' % {'chr':min_chr, 'pos':min_pos, 'gnm':'MP'[i]} process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() if process.returncode == 0: fields = out.split('\t') if len(fields) < 4: #there is no coverage in the BAM file for this position coverage[i] = 0 #print "!!! :", out, "|", err, "|", cmd else: coverage[i] = int(fields[3]) #print out, '>>coverage>>', coverage[i] else: print err """ coverageOK = bool(coverage[M] >= 15 and coverage[P] >= 15) contaminationOK = False try: contaminationOK = bool( float(coverage[M]) / count_sum[M] >= 0.9 and float(coverage[P]) / count_sum[P] >= 0.9) except: contaminationOK = False if coverageOK and not contaminationOK: print min_pos, "- contamination M:", coverage[M], posInfo[ M + 2][min_pos], alleles[M], "P:", coverage[P], posInfo[ P + 2][min_pos], alleles[P] #MOK = bool(callQ[M] >= 75 or coverage[M] >= 15) #POK = bool(callQ[P] >= 75 or coverage[P] >= 15) #if not (MOK and POK): #ignore positions that are not good enough if not (qualityOK and coverageOK and contaminationOK): #ignore positions that are not good enough ignored_pos += 1 for i in ALL_VCF: #if there is a SNP in the data at this position, skip it if min_chr == snps[i][0] and min_pos == snps[i][1]: #print min_pos, callQ[M], callQ[P], coverage[M], coverage[P], prev_lines[i], prev_lines[i] = '' #read input: next SNP for i in ALL_VCF: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = getlineFromFile(in_files, out_files, i) #END WHILE print "Low quality positions ignored in the region:", ignored_pos
def main(): #parse ARGs parser = argparse.ArgumentParser(description='Prepare SNP support data for FCNV. Read filenames: for joined M&P phased .vcf file; plasma, M, and P .sam files; and for centromeres list.') parser.add_argument('filenames', type=str, nargs='+', help='paths to 1) .vcf file with phased M & P SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.') args = parser.parse_args() if len(args.filenames) != 5: exit("Unexpected number of arguments passed! Expecting 5 filenames.") #treat these as CONSTANTS! MP = 0; PLR = 1; MR = 2; PR = 3; CT = 4; #in_files ALL = [MP, PLR, MR, PR, CT] M = 0; P = 1; #maternal, paternal ALDOC = 0; GT = 1; #out_files: allele DOC and ground truth #list of input files in_files = [open(args.filenames[i], "r" ) for i in ALL] #read centromeres positions centromeres = dict() for line in in_files[CT].readlines(): line = line.rstrip('\n').split('\t') if line[0] not in centromeres.keys(): centromeres[line[0]] = [] centromeres[line[0]] += [(int(line[1]), int(line[2]))] #allele counts in plasma samples for particular positions pos_data = dict() loci = dict() processed_chr = '' skipped_in_centromere = 0 print " Getting union of SNP positions and corresponding list of alleles " + datetime.now().strftime('%m-%d-%H-%M') #read SNPs from M, P, F vcf files snps = [[] for i in [M, P]] #get genotypes for all positions in UNION of M and P SNP positions while True: line = in_files[MP].readline() #skip if part of the header while len(line) > 0 and line[0] == '#': line = in_files[MP].readline() if not line: break fields = line.rstrip('\n').split('\t') if processed_chr == '': processed_chr = 'chr'+fields[0] if processed_chr != 'chr'+fields[0]: print "WARNING: multiple chromosomes in the input", processed_chr, "|", 'chr'+fields[0] pos = int(fields[1]) ref = fields[3] alt = fields[4] #get M and P haplotypes snps[M] = fields[9].split(':')[0].split('|') for x in [0, 1]: if snps[M][x] == '0': snps[M][x] = ref else: snps[M][x] = alt snps[P] = fields[10].split(':')[0].split('|') for x in [0, 1]: if snps[P][x] == '0': snps[P][x] = ref else: snps[P][x] = alt #if in centromere region, skip centromere_regions = centromeres[processed_chr] if is_within_intervals(pos, centromere_regions): skipped_in_centromere += 1 continue #take note that for this position we need to get allele counts in plasma samaples alleles = (snps[M], snps[P]) loci[pos] = alleles sp.add_pos(pos, pos_data) #END WHILE print " Piling up the reads " + datetime.now().strftime('%m-%d-%H-%M') #set up datastructures for counting allele support in diffrenct SAM files posInfo = [dict() for i in ALL] for R in [PLR, MR, PR]: posInfo[R] = copy.deepcopy(pos_data) #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'pos_data' for R in [PLR, MR, PR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) print " Writing output " + datetime.now().strftime('%m-%d-%H-%M') #list of output files out_files = [None for i in [ALDOC, GT]] out_files[ALDOC] = open(processed_chr + "_alleles_docOWN.txt", "w") out_files[GT] = open(processed_chr + "_targetOWN.txt", "w") print >>out_files[ALDOC], '#POS\tA\tC\tG\tT\tM_hapA\tM_hapB\tDP_hapA\tDP_hapB\tP_hapA\tP_hapB\tDP_hapA\tDP_hapB' skipped_low_doc = 0 #print info / compute stats for each SNP position for pos in sorted(pos_data.keys()): alleles = loci[pos] #print the plasma allele counts nuc_counts = posInfo[PLR][pos] tmp = [] for nuc in 'ACGT': #to make sure they are in the right order try: tmp.append(str(nuc_counts[nuc])) except KeyError: tmp.append('0') #if the plasma coverage is too low, skip this position if sum(map(int, tmp)) < 50: print pos, "- low overall coverage", sum(map(int, tmp)) skipped_low_doc += 1 continue print >>out_files[ALDOC], str(pos) + '\t' + '\t'.join(tmp), #output M, P alleles at this SNP locus for i, r in [(M, MR), (P, PR)]: a1 = alleles[i][0] a2 = alleles[i][1] count_a1 = 0 count_a2 = 0 try: count_a1 = posInfo[r][pos][a1] except: print i, pos, a1, posInfo[r][pos], alleles[i] try: count_a2 = posInfo[r][pos][a2] except: print i, pos, a2, posInfo[r][pos], alleles[i] if a1 == a2: count_a1 /= 2. count_a2 /= 2. print >>out_files[ALDOC], '\t{0}\t{1}\t{2}\t{3}'.format(a1, a2, count_a1, count_a2), print >>out_files[ALDOC], '\n', if 10181440 <= pos and pos <= 10281440: print >>out_files[GT], '{0}\t{1}\t{2}\t{3}'.format(pos, 'N', 'N', 6) else: print >>out_files[GT], '{0}\t{1}\t{2}\t{3}'.format(pos, 'N', 'N', 3) print "Low overall coverage positions ignored:", skipped_low_doc print "Ignored positions in centromere regions:", skipped_in_centromere print "DONE " + datetime.now().strftime('%m-%d-%H-%M')
def main(): #parse ARGs parser = argparse.ArgumentParser( description= 'Analyze mixture and allele ratios for SNP positions in union(M, P). Read filenames for M, P .vcf files. Further M, F .sam files with reads that together form plasma reads.' ) parser.add_argument( 'filenames', type=str, nargs='+', help= 'paths to .vcf files with M, P SNPs and *sorted* M, F reads in SAM format' ) args = parser.parse_args() if len(args.filenames) != 4: die("Unexpected number of arguments passed! Expecting 4 filenames.") #treat these as CONSTANTS! M = 0 P = 1 MR = 2 FR = 3 ALL = [M, P, MR, FR] #list of input files in_files = [open(args.filenames[i], "r") for i in ALL] #list of output files #out_files = [None for i in ALL] #out_files[M] = open("M_alleles.txt", "w") #union of maternal and paternal SNP positions data = dict() loci = dict() #read SNPs from M, P, F vcf files snps = [[] for i in [M, P]] for i in [M, P]: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #union the maternal and paternals SNPs positions while len(snps[M]) > 2 or len( snps[P]) > 2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in [M, P]: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in [M, P]: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P]] for i in [M, P]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3], snps[i][4]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #organize the haplotypes in M, P (phased VCF files) for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out haplotype config info ht = map(int, info[9].split('/')) #get the configuration phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]] alleles[i] = phased_alleles #take note that for this position we need to get allele counts in plasma samaples sp.add_pos(min_pos, data) #loci[min_pos] = alleles #print min_pos, ": M:", alleles[M], " P:", alleles[P], " F:", alleles[F] #read input: next SNP for i in [M, P]: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #fetch the maternal and fetal portion of plasma reads from SAM files, # and get counts for the positions specified in 'data' posInfo = [dict() for i in ALL] posInfo[MR] = copy.deepcopy(data) posInfo[FR] = copy.deepcopy(data) for R in [MR, FR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) #compute and print the stats for pos in sorted(data.keys()): MR_nuc_counts = posInfo[MR][pos] FR_nuc_counts = posInfo[FR][pos] try: local_mix_ratio = float(sum(FR_nuc_counts.values())) / ( sum(FR_nuc_counts.values()) + 9 * sum(MR_nuc_counts.values())) except ZeroDivisionError: local_mix_ratio = 0 print pos, local_mix_ratio, ''' for i, NC in enumerate([MR_nuc_counts, FR_nuc_counts]): tmp = [] for nuc in 'ACGT': #to make sure they are in the right order try: tmp.append(NC[nuc]) except KeyError: tmp.append(0) try: summ = float(sum(tmp)) tmp = [tmp[i]/summ for i in range(len(tmp))] except ZeroDivisionError: pass #tmp = [0] #print sorted(tmp), try: ind1 = 'ACGT'.index(loci[pos][i][0].upper()) ind2 = 'ACGT'.index(loci[pos][i][1].upper()) print abs(tmp[ind1] - tmp[ind2]), except IndexError: print "X", loci[pos][i][0].upper(), loci[pos][i][1].upper(), tmp, print 0, ''' print " "
def main(): #parse ARGs parser = argparse.ArgumentParser(description='Analyze mixture and allele ratios for SNP positions in union(M, P). Read filenames for M, P .vcf files. Further M, F .sam files with reads that together form plasma reads.') parser.add_argument('filenames', type=str, nargs='+', help='paths to .vcf files with M, P SNPs and *sorted* M, F reads in SAM format') args = parser.parse_args() if len(args.filenames) != 4: die("Unexpected number of arguments passed! Expecting 4 filenames.") #treat these as CONSTANTS! M = 0; P = 1; MR = 2; FR = 3; ALL = [M, P, MR, FR] #list of input files in_files = [open(args.filenames[i], "r" ) for i in ALL] #list of output files #out_files = [None for i in ALL] #out_files[M] = open("M_alleles.txt", "w") #union of maternal and paternal SNP positions data = dict() loci = dict() #read SNPs from M, P, F vcf files snps = [[] for i in [M, P]] for i in [M, P]: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #union the maternal and paternals SNPs positions while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in [M, P]: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) #position min_pos = 1e15 for i in [M, P]: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P]] for i in [M, P]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3], snps[i][4]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #organize the haplotypes in M, P (phased VCF files) for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out haplotype config info ht = map(int, info[9].split('/')) #get the configuration phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]] alleles[i] = phased_alleles #take note that for this position we need to get allele counts in plasma samaples sp.add_pos(min_pos, data) #loci[min_pos] = alleles #print min_pos, ": M:", alleles[M], " P:", alleles[P], " F:", alleles[F] #read input: next SNP for i in [M, P]: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #fetch the maternal and fetal portion of plasma reads from SAM files, # and get counts for the positions specified in 'data' posInfo = [dict() for i in ALL] posInfo[MR] = copy.deepcopy(data) posInfo[FR] = copy.deepcopy(data) for R in [MR, FR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) #compute and print the stats for pos in sorted(data.keys()): MR_nuc_counts = posInfo[MR][pos] FR_nuc_counts = posInfo[FR][pos] try: local_mix_ratio = float(sum(FR_nuc_counts.values())) / (sum(FR_nuc_counts.values()) + 9*sum(MR_nuc_counts.values())) except ZeroDivisionError: local_mix_ratio = 0 print pos, local_mix_ratio, ''' for i, NC in enumerate([MR_nuc_counts, FR_nuc_counts]): tmp = [] for nuc in 'ACGT': #to make sure they are in the right order try: tmp.append(NC[nuc]) except KeyError: tmp.append(0) try: summ = float(sum(tmp)) tmp = [tmp[i]/summ for i in range(len(tmp))] except ZeroDivisionError: pass #tmp = [0] #print sorted(tmp), try: ind1 = 'ACGT'.index(loci[pos][i][0].upper()) ind2 = 'ACGT'.index(loci[pos][i][1].upper()) print abs(tmp[ind1] - tmp[ind2]), except IndexError: print "X", loci[pos][i][0].upper(), loci[pos][i][1].upper(), tmp, print 0, ''' print " "
def main(): #parse ARGs parser = argparse.ArgumentParser( description= 'Prepare SNP data for FCNV. Read filenames: for M, P, and F .vcf files; plasma, M, and P .sam files; and for centromeres list.' ) parser.add_argument( 'filenames', type=str, nargs='+', help= 'paths to 1) .vcf files with M, P, F SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.' ) args = parser.parse_args() if len(args.filenames) != 7: exit("Unexpected number of arguments passed! Expecting 7 filenames.") #treat these as CONSTANTS! M = 0 P = 1 F = 2 PLASMA = 3 MR = 4 PR = 5 CT = 6 ALL = [M, P, F, PLASMA, MR, PR, CT] #list of input files in_files = [open(args.filenames[i], "r") for i in ALL] #list of output files out_files = [None for i in [M, P, F, PLASMA]] out_files[M] = open("M_alleles.txt", "w") out_files[P] = open("P_alleles.txt", "w") out_files[F] = open("F_alleles.txt", "w") out_files[PLASMA] = open("plasma_samples.txt", "w") date = datetime.now().strftime('%m-%d-%H-%M') out_pos_file = open("positions" + date + ".txt", "w") #read centromeres positions centromeres = dict() for line in in_files[CT].readlines(): line = line.rstrip('\n').split('\t') if line[0] not in centromeres.keys(): centromeres[line[0]] = [] centromeres[line[0]] += [(int(line[1]), int(line[2]))] #allele counts in plasma samples for particular positions data = dict() loci = dict() processed_chr = '' print " Getting union of SNP positions and corresponding list of alleles" #read SNPs from M, P, F vcf files snps = [[] for i in [M, P, F]] for i in [M, P, F]: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #get genotypes for all positions in UNION of M and P SNP positions while len(snps[M]) > 2 or len( snps[P]) > 2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in [M, P, F]: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) if processed_chr == '': processed_chr = min_chr if processed_chr != min_chr: print "WARNING: multiple chromosomes in the input", processed_chr, "|", min_chr #position min_pos = 1e15 for i in [M, P]: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P, F]] for i in [M, P, F]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3][0], snps[i][4][0]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P, F]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #check for homozygous alternative sites in Fetal VCF for i in [F]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out genotype config info gt = info[9].split(':')[0] #if homozygous alternative if gt[0] == '1': alleles[i][0] = snps[i][4][0] if gt[2] == '1': alleles[i][1] = snps[i][4][0] if gt[0] == '0': alleles[i][0] = snps[i][3][0] if gt[2] == '0': alleles[i][1] = snps[i][3][0] #organize the haplotypes in M, P (phased VCF files) for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out haplotype config info ht = map(int, info[9].split('/')) #get the configuration phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]] alleles[i] = phased_alleles #take note that for this position we need to get allele counts in plasma samaples loci[min_pos] = alleles sp.add_pos(min_pos, data) #read input: next SNP for i in [M, P, F]: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #END WHILE print " Aligning the reads" #set up datastructures for counting allele support in diffrenct SAM files posInfo = [dict() for i in ALL] for R in [PLASMA, MR, PR]: posInfo[R] = copy.deepcopy(data) #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data' for R in [PLASMA, MR, PR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) print " Writing output" skipped_in_centromere = 0 skipped_low = 0 centromere_regions = centromeres[processed_chr] #print info / compute stats for each SNP position for pos in sorted(data.keys()): alleles = loci[pos] #if alleles[M][0] != alleles[M][1]: #print the plasma allele counts nuc_counts = posInfo[PLASMA][pos] tmp = [] for nuc in 'ACGT': #to make sure they are in the right order try: tmp.append(str(nuc_counts[nuc])) except KeyError: tmp.append('0') #if the plasma coverage is too low, skip this position if sum(map(int, tmp)) < 20: print pos, "- low overall coverage", sum(map(int, tmp)) skipped_low += 1 continue #if in centromere region, skip if is_within_intervals(pos, centromere_regions): skipped_in_centromere += 1 continue print >> out_files[PLASMA], ' '.join(tmp) #output M, P, F alleles at this SNP locus for i, r in [(M, MR), (P, PR)]: a1 = alleles[i][0] a2 = alleles[i][1] count_a1 = 0 count_a2 = 0 try: count_a1 = posInfo[r][pos][a1] except: print i, pos, a1, posInfo[r][pos], alleles[i] try: count_a2 = posInfo[r][pos][a2] except: print i, pos, a2, posInfo[r][pos], alleles[i] if a1 == a2: count_a1 /= 2. count_a2 /= 2. print >> out_files[i], a1, a2, count_a1, count_a2 print >> out_files[F], alleles[F][0], alleles[F][1], 3 print >> out_pos_file, pos, "- M:", alleles[M], " P:", alleles[ P], " F:", alleles[F] print "Low overall coverage positions ignored:", skipped_low print "Ignored positions in centromere regions:", skipped_in_centromere
def main(): #parse ARGs parser = argparse.ArgumentParser(description='Prepare SNP data for FCNV. Read filenames: for M, P, and F .vcf files; plasma, M, and P .sam files; and for centromeres list.') parser.add_argument('filenames', type=str, nargs='+', help='paths to 1) .vcf files with M, P, F SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.') args = parser.parse_args() if len(args.filenames) != 7: exit("Unexpected number of arguments passed! Expecting 7 filenames.") #treat these as CONSTANTS! M = 0; P = 1; F = 2; PLASMA = 3; MR = 4; PR = 5; CT = 6; ALL = [M, P, F, PLASMA, MR, PR, CT] #list of input files in_files = [open(args.filenames[i], "r" ) for i in ALL] #list of output files out_files = [None for i in [M, P, F, PLASMA]] out_files[M] = open("M_alleles.txt", "w") out_files[P] = open("P_alleles.txt", "w") out_files[F] = open("F_alleles.txt", "w") out_files[PLASMA] = open("plasma_samples.txt", "w") date = datetime.now().strftime('%m-%d-%H-%M') out_pos_file = open("positions" + date + ".txt", "w") #read centromeres positions centromeres = dict() for line in in_files[CT].readlines(): line = line.rstrip('\n').split('\t') if line[0] not in centromeres.keys(): centromeres[line[0]] = [] centromeres[line[0]] += [(int(line[1]), int(line[2]))] #allele counts in plasma samples for particular positions data = dict() loci = dict() processed_chr = '' print " Getting union of SNP positions and corresponding list of alleles" #read SNPs from M, P, F vcf files snps = [[] for i in [M, P, F]] for i in [M, P, F]: #skip the header line = in_files[i].readline() while len(line) > 0 and line[0] == '#': line = in_files[i].readline() #split snps[i] = line.split('\t') #get genotypes for all positions in UNION of M and P SNP positions while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P #get the position of SNP that occure first for i in [M, P, F]: if snps[i][0] == '': #if an input files is already at EOF snps[i][0] = 'chrZZ' snps[i].append(1e15) else: #convert to int snps[i][1] = int(snps[i][1]) #chromosome min_chr = min(snps[M][0], snps[P][0]) if processed_chr == '': processed_chr = min_chr if processed_chr != min_chr: print "WARNING: multiple chromosomes in the input", processed_chr, "|", min_chr #position min_pos = 1e15 for i in [M, P]: if min_chr == snps[i][0] and snps[i][1] < min_pos: min_pos = snps[i][1] #get alleles alleles = [['.', '.'] for x in [M, P, F]] for i in [M, P, F]: #if there is a SNP in the data at this position, use it if min_chr == snps[i][0] and min_pos == snps[i][1]: alleles[i] = [snps[i][3][0], snps[i][4][0]] #if there is no info on some allele, impute the reference allele ref_allele = alleles[M][0] if ref_allele == '.': ref_allele = alleles[P][0] for i in [M, P, F]: for a in [0, 1]: if alleles[i][a] == '.': alleles[i][a] = ref_allele #check for homozygous alternative sites in Fetal VCF for i in [F]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out genotype config info gt = info[9].split(':')[0] #if homozygous alternative if gt[0] == '1': alleles[i][0] = snps[i][4][0] if gt[2] == '1': alleles[i][1] = snps[i][4][0] if gt[0] == '0': alleles[i][0] = snps[i][3][0] if gt[2] == '0': alleles[i][1] = snps[i][3][0] #organize the haplotypes in M, P (phased VCF files) for i in [M, P]: #if there is a SNP in the data at this position if min_chr == snps[i][0] and min_pos == snps[i][1]: info = snps[i] if len(info) <= 2: continue #parse out haplotype config info ht = map(int, info[9].split('/')) #get the configuration phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]] alleles[i] = phased_alleles #take note that for this position we need to get allele counts in plasma samaples loci[min_pos] = alleles sp.add_pos(min_pos, data) #read input: next SNP for i in [M, P, F]: if min_chr >= snps[i][0] and min_pos >= snps[i][1]: snps[i] = in_files[i].readline().split('\t') #END WHILE print " Aligning the reads" #set up datastructures for counting allele support in diffrenct SAM files posInfo = [dict() for i in ALL] for R in [PLASMA, MR, PR]: posInfo[R] = copy.deepcopy(data) #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data' for R in [PLASMA, MR, PR]: while True: line = in_files[R].readline() if not line: break if len(line) > 0 and line[0] == '@': continue sp.pile_up(sp.mapping_parser(line), posInfo[R]) print " Writing output" skipped_in_centromere = 0 skipped_low = 0 centromere_regions = centromeres[processed_chr] #print info / compute stats for each SNP position for pos in sorted(data.keys()): alleles = loci[pos] #if alleles[M][0] != alleles[M][1]: #print the plasma allele counts nuc_counts = posInfo[PLASMA][pos] tmp = [] for nuc in 'ACGT': #to make sure they are in the right order try: tmp.append(str(nuc_counts[nuc])) except KeyError: tmp.append('0') #if the plasma coverage is too low, skip this position if sum(map(int, tmp)) < 20: print pos, "- low overall coverage", sum(map(int, tmp)) skipped_low += 1 continue #if in centromere region, skip if is_within_intervals(pos, centromere_regions): skipped_in_centromere += 1 continue print >>out_files[PLASMA], ' '.join(tmp) #output M, P, F alleles at this SNP locus for i, r in [(M, MR), (P, PR)]: a1 = alleles[i][0] a2 = alleles[i][1] count_a1 = 0 count_a2 = 0 try: count_a1 = posInfo[r][pos][a1] except: print i, pos, a1, posInfo[r][pos], alleles[i] try: count_a2 = posInfo[r][pos][a2] except: print i, pos, a2, posInfo[r][pos], alleles[i] if a1 == a2: count_a1 /= 2. count_a2 /= 2. print >>out_files[i], a1, a2, count_a1, count_a2 print >>out_files[F], alleles[F][0], alleles[F][1], 3 print >>out_pos_file, pos, "- M:", alleles[M], " P:", alleles[P], " F:", alleles[F] print "Low overall coverage positions ignored:", skipped_low print "Ignored positions in centromere regions:", skipped_in_centromere