def main():
    """Merge the VCF files named on the command line into one sorted stream.

    Each entry of ``sys.argv[1:]`` is passed to ``l_bp.parse_vcf`` together
    with the shared ``vcf_lines`` / ``vcf_headers`` lists; the sample names it
    returns are recorded as ``##SAMPLE`` header lines.  The SNAME and ALG INFO
    declarations and the column header are then added, headers are ordered
    with ``l_bp.header_line_cmp``, records with ``l_bp.vcf_line_cmp``, and
    everything is written to standard output (headers first).

    Raises:
        SystemExit: when no input file is given; the usage text goes to
            stderr and the exit status is 1.
    """
    # Function-scope import so this block does not rely on the module header.
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
    """
    if len(sys.argv) < 2:
        # sys.exit(str) prints the text to stderr and exits with status 1,
        # without depending on the site-provided ``exit`` helper.
        sys.exit(usage)

    vcf_file_names = sys.argv[1:]

    vcf_lines = []
    vcf_headers = []
    for vcf_file_name in vcf_file_names:
        # presumably parse_vcf appends this file's headers and records to the
        # two lists -- confirm against l_bp
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        for sample in samples:
            vcf_headers.append("##SAMPLE=<ID=" + sample + ">\n")

    vcf_headers.append("##INFO=<ID=SNAME,Number=.,Type=String,"
                       "Description=\"Source sample name\">\n")
    vcf_headers.append("##INFO=<ID=ALG,Number=1,Type=String,"
                       "Description=\"Evidence PDF aggregation algorithm\">\n")
    vcf_headers.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\t"
                       "VARIOUS\n")

    # list.sort() lost its ``cmp=`` argument in Python 3; cmp_to_key adapts
    # the existing comparison functions and also works on Python 2.7.
    vcf_headers.sort(key=functools.cmp_to_key(l_bp.header_line_cmp))
    # The header lines built above already end in "\n", so they are written
    # verbatim: print(..., end=' ') would start every following line with a
    # stray space.
    sys.stdout.writelines(vcf_headers)

    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))
    # A disabled (commented-out) step that padded one-sided BND records with
    # a '++:0' / '--:0' entry used to sit here; records are emitted unmodified.
    # NOTE(review): assumes parse_vcf keeps each record's trailing newline.
    sys.stdout.writelines(vcf_lines)
def main():
    """Merge the VCF files named on the command line into one sorted stream.

    Each entry of ``sys.argv[1:]`` is passed to ``l_bp.parse_vcf`` together
    with the shared ``vcf_lines`` / ``vcf_headers`` lists; the sample names it
    returns are recorded as ``##SAMPLE`` header lines.  The SNAME and ALG INFO
    declarations and the column header are then added, headers are ordered
    with ``l_bp.header_line_cmp``, records with ``l_bp.vcf_line_cmp``, and
    everything is written to standard output (headers first).

    Raises:
        SystemExit: when no input file is given; the usage text goes to
            stderr and the exit status is 1.
    """
    # Function-scope import so this block does not rely on the module header.
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
    """
    if len(sys.argv) < 2:
        # Show the usage text instead of exiting silently; sys.exit(str)
        # writes it to stderr and still exits with status 1.
        sys.exit(usage)

    vcf_file_names = sys.argv[1:]

    vcf_lines = []
    vcf_headers = []
    for vcf_file_name in vcf_file_names:
        # presumably parse_vcf appends this file's headers and records to the
        # two lists -- confirm against l_bp
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        for sample in samples:
            vcf_headers.append("##SAMPLE=<ID=" + sample + ">\n")

    vcf_headers.append("##INFO=<ID=SNAME,Number=.,Type=String,"
                       "Description=\"Source sample name\">\n")
    vcf_headers.append("##INFO=<ID=ALG,Number=1,Type=String,"
                       "Description=\"Evidence PDF aggregation algorithm\">\n")
    vcf_headers.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\t"
                       "VARIOUS\n")

    # key=cmp_to_key(...) is equivalent to the old cmp= argument on Python 2.7
    # and keeps working on Python 3, where cmp= no longer exists.
    vcf_headers.sort(key=functools.cmp_to_key(l_bp.header_line_cmp))
    # Writing the newline-terminated lines verbatim matches what the
    # Python 2 ``print line,`` statement produced for them.
    sys.stdout.writelines(vcf_headers)

    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))
    # A disabled (commented-out) step that padded one-sided BND records with
    # a '++:0' / '--:0' entry used to sit here; records are emitted unmodified.
    # NOTE(review): assumes parse_vcf keeps each record's trailing newline.
    sys.stdout.writelines(vcf_lines)
def l_cluster(file_name, percent_slop=0, fixed_slop=0):
    """Print the headers of one VCF file and cluster its breakpoints.

    The file is read with ``l_bp.parse_vcf`` (``add_sname=False``).  Its
    header lines, plus a ``#CHROM ... INFO`` column line, are ordered with
    ``l_bp.header_line_cmp`` and written to standard output.  The records are
    then walked in the order parse_vcf delivered them: consecutive
    breakpoints are grouped while they share chromosome and SV type and their
    left start does not pass the group's largest left end; every finished
    group is handed to ``r_cluster``.

    Args:
        file_name: path passed through to ``l_bp.parse_vcf``.
        percent_slop: forwarded unchanged to ``l_bp.breakpoint``.
        fixed_slop: forwarded unchanged to ``l_bp.breakpoint``.

    Returns:
        None.
    """
    # Function-scope imports so this block does not rely on the module header.
    import functools
    import sys

    sample_prefix = "##SAMPLE=<ID="

    vcf_lines = []
    vcf_headers = set()
    # The value returned by parse_vcf is not needed here.
    l_bp.parse_vcf(file_name, vcf_lines, vcf_headers, add_sname=False)
    vcf_headers.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")

    # Pull the ID out of each '##SAMPLE=<ID=name>' header line.
    # NOTE(review): the order follows set iteration order, which is
    # unspecified -- confirm r_cluster does not need a stable sample order.
    sample_order = [
        header.rstrip()[len(sample_prefix):-1]
        for header in vcf_headers
        if header.startswith("##SAMPLE")
    ]

    # key=cmp_to_key(...) replaces the Python-2-only cmp= argument.
    ordered_headers = sorted(
        vcf_headers, key=functools.cmp_to_key(l_bp.header_line_cmp))
    # NOTE(review): written verbatim; assumes header lines end in "\n".
    sys.stdout.writelines(ordered_headers)

    v_id = 0
    cluster = []
    cluster_chr = ''
    cluster_sv_type = ''
    cluster_max_end = -1

    for line in vcf_lines:
        bp = l_bp.breakpoint(line,
                             percent_slop=percent_slop,
                             fixed_slop=fixed_slop)

        extends_cluster = (not cluster) or (
            bp.start_l <= cluster_max_end
            and bp.chr_l == cluster_chr
            and bp.sv_type == cluster_sv_type)

        if extends_cluster:
            cluster.append(bp)
            cluster_max_end = max(cluster_max_end, bp.end_l)
        else:
            # The current group is complete: emit it and start a new one.
            v_id = r_cluster(cluster, sample_order, v_id)
            cluster = [bp]
            cluster_max_end = bp.end_l

        # Either way the newest breakpoint defines the group's chromosome
        # and SV type for the next comparison.
        cluster_chr = bp.chr_l
        cluster_sv_type = bp.sv_type

    # Flush whatever is still pending after the last record.
    if cluster:
        r_cluster(cluster, sample_order, v_id)
def main():
    """Merge the VCF files named on the command line into one sorted stream.

    The header set starts with the SVNAME INFO declaration and the column
    header.  Each entry of ``sys.argv[1:]`` is passed to ``l_bp.parse_vcf``
    together with the shared ``vcf_lines`` list and header set, and the
    sample names it returns are added as ``##SAMPLE`` header lines.  Headers
    are ordered with ``l_bp.header_line_cmp``, records with
    ``l_bp.vcf_line_cmp``, and everything is written to standard output
    (headers first).

    Raises:
        SystemExit: when no input file is given; the usage text goes to
            stderr and the exit status is 1.
    """
    # Function-scope import so this block does not rely on the module header.
    import functools

    usage = """%prog <VCF file 1> <VCF file 2> ... <VCF file N>

l_sort
Author: Ryan Layer, Colby Chiang, & Ira Hall
Description: sort N VCF files into a single file
Version: 0.01
    """
    if len(sys.argv) < 2:
        # Show the usage text instead of exiting silently; sys.exit(str)
        # writes it to stderr and still exits with status 1.
        sys.exit(usage)

    vcf_file_names = sys.argv[1:]

    vcf_lines = []
    # A set keeps header lines shared by several inputs from repeating.
    # NOTE(review): the ID here is "SVNAME"; confirm it matches the INFO key
    # that parse_vcf actually writes into the records.
    vcf_headers = {
        "##INFO=<ID=SVNAME,Number=.,Type=String,"
        "Description=\"Source sample name\">\n",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\t"
        "VARIOUS\n",
    }

    for vcf_file_name in vcf_file_names:
        # presumably parse_vcf adds this file's headers to the set and its
        # records to vcf_lines -- confirm against l_bp
        samples = l_bp.parse_vcf(vcf_file_name, vcf_lines, vcf_headers)
        for sample in samples:
            vcf_headers.add("##SAMPLE=<ID=" + sample + ">\n")

    # key=cmp_to_key(...) is equivalent to the old cmp= argument on Python 2.7
    # and keeps working on Python 3, where cmp= no longer exists.
    ordered_headers = sorted(
        vcf_headers, key=functools.cmp_to_key(l_bp.header_line_cmp))
    # Writing the newline-terminated lines verbatim matches what the
    # Python 2 ``print line,`` statement produced for them.
    sys.stdout.writelines(ordered_headers)

    vcf_lines.sort(key=functools.cmp_to_key(l_bp.vcf_line_cmp))
    # NOTE(review): assumes parse_vcf keeps each record's trailing newline.
    sys.stdout.writelines(vcf_lines)