def main():
    """Write ``<invcf>.list.csv`` listing the sample columns of a VCF file.

    The VCF path is taken from ``sys.argv[1]``.  The file is scanned up to
    and including the column header line (the first ``#`` line that is not a
    ``##`` definition line); for every sample column found after the nine
    shared VCF columns one tab-separated row is written::

        1 <TAB> <invcf>|<1-based sample index> <TAB> <sample name>

    Data lines are never processed: reading stops right after the column
    header.  The process exits with status 1 when no path was given.
    """
    try:
        invcf = sys.argv[1]
    except IndexError:
        # Only a missing command-line argument is expected here.
        print("<invcf>")
        print("EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz")
        sys.exit(1)

    print("input vcf %s" % invcf)
    # NOTE(review): checkfile is defined elsewhere; presumably it validates
    # that the path exists/readable -- confirm.
    checkfile(invcf)

    with openfile(invcf, 'r') as fhdi:
        # Binary mode is what the Python 2 csv module expects for its output.
        with open(invcf + '.list.csv', 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')

            for line in fhdi:
                line = line.strip()

                # Blank lines and anything that is not a header are ignored.
                if not line or not line.startswith("#"):
                    continue

                print("HEADER %s" % line)

                if line.startswith("##"):
                    # Definition (meta-information) line: only echoed.
                    print("HEADER :: DEF %s" % line)
                    continue

                # Column description line.
                print("HEADER :: COL %s" % line)
                cols = line.split("\t")
                shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                names = cols[9:]   # one entry per sample column

                print("HEADER :: COL :: SHARED %s" % (shared,))
                print("HEADER :: COL :: NAMES %s" % (names,))

                for ln, name in enumerate(names):
                    writer.writerow(["1", "%s|%d" % (invcf, ln + 1), name])

                # Everything needed lives in the header; skip the data lines.
                break
def main():
    """Create the sample list file (``<invcf>.list.csv``) for a VCF.

    Reads the input path from ``sys.argv[1]`` and exits with status 1 if it
    is missing.  Header lines are echoed to stdout.  When the column header
    line is reached, each sample column (everything after the first nine
    columns) becomes one tab-separated output row made of the constant
    ``1``, the string ``<invcf>|<1-based column number>`` and the sample
    name.  Reading stops as soon as that line has been handled.
    """
    try:
        invcf = sys.argv[1]
    except IndexError:
        # A bare ``except`` would also hide unrelated failures.
        print("<invcf>")
        print("EG.: csv_list_multicolumn.py.py 1001genomes_snp-short-indel_only_ACGTN.vcf.gz")
        sys.exit(1)

    print("input vcf %s" % invcf)
    checkfile(invcf)  # defined elsewhere in the project

    list_path = invcf + '.list.csv'

    with openfile(invcf, 'r') as fhdi:
        # 'wb': the Python 2 csv module wants a binary-mode file object.
        with open(list_path, 'wb') as fhdo:
            writer = csv.writer(fhdo, delimiter='\t', quotechar='"')

            for raw in fhdi:
                line = raw.strip()
                if len(line) == 0:
                    continue

                if line.startswith("#"):
                    print("HEADER %s" % line)

                    if line.startswith("##"):
                        # Definition line.
                        print("HEADER :: DEF %s" % line)
                    else:
                        # Column description line.
                        print("HEADER :: COL %s" % line)
                        fields = line.split("\t")
                        # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                        shared = fields[:9]
                        names = fields[9:]

                        print("HEADER :: COL :: SHARED %s" % (shared,))
                        print("HEADER :: COL :: NAMES %s" % (names,))

                        for number, name in enumerate(names, 1):
                            row = ["1", "%s|%d" % (invcf, number), name]
                            writer.writerow(row)

                        # The sample list is complete; data lines are not needed.
                        break
def open(self):
    """
    Open the GFF file named by ``self.infile`` for reading and keep the
    resulting handle in ``self.fhd``.
    """
    handle = filemanager.openfile(self.infile, 'r')
    self.fhd = handle
def main(incsv, translation_str):
    """Merge the VCF files listed in ``incsv`` into ``<incsv>.vcf.gz``.

    Args:
        incsv: path of a tab-separated list file.  Lines starting with ``#``
            and blank lines are skipped; for every other line the first (up
            to) three columns are passed to ``vcfHeap.addFile``.
        translation_str: optional chromosome renaming table written as
            ``src:dst;src:dst``; ``None`` means no renaming.

    Returns:
        The path of the merged file (``incsv + '.vcf.gz'``).

    The merged records are written to ``<outfile>.tmp.vcf.gz`` first and the
    file is renamed only after everything was written.  The process exits
    with status 1 when ``incsv`` is missing or the output already exists.
    """
    outfile = incsv + '.vcf.gz'
    tmpfile = outfile + '.tmp.vcf.gz'

    if not os.path.exists(incsv):
        print("input file does not exists. quitting like a whimp")
        sys.exit(1)

    print("reading %s" % incsv)

    translation = {}
    if translation_str is not None:
        for pair in translation_str.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst

    print("Translation %s" % (translation,))

    if os.path.exists(outfile):
        print("output file %s exists. quitting like a whimp" % outfile)
        sys.exit(1)

    print("saving to %s" % outfile)

    data = vcfHeap(translation=translation)

    # Context manager so the list file handle is released (it used to leak).
    with openfile(incsv, 'r') as cfh:
        for line in cfh:
            if line.startswith("#"):
                continue

            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line is not a record.
                continue

            cols = line.split('\t')
            assert len(cols) >= 2
            print("%s %s" % (cols, cols[:3]))
            data.addFile(*cols[:3])

    num_lines = 0
    print_every = 1000
    lines = []

    mfh = openvcffile(tmpfile, 'w', compresslevel=1)
    try:
        mfh.write(data.getVcfHeader())
        mfh.flush()

        while not data.isempty():
            val = data.next()
            if val is None:
                print("val is empty")
                break

            lines.append(str(val))

            # One dot every print_every/100 records; ``//`` keeps this an
            # integer division on every Python version.
            if len(lines) % (print_every // 100) == 0:
                sys.stdout.write('.')

                if len(lines) % print_every == 0:
                    # Buffer is full: report the running total and write it out.
                    num_lines += len(lines)
                    sys.stdout.write(' {:14,d}\n'.format(num_lines))
                    mfh.write("".join(lines))
                    mfh.flush()
                    lines = []

                sys.stdout.flush()

        # Whatever is left in the buffer.
        mfh.write("".join(lines))
        mfh.flush()
    finally:
        # Close the temporary file even when merging fails half way.
        mfh.close()

    num_lines += len(lines)

    sys.stdout.write('\nTotal {:14,d}\n'.format(num_lines))

    os.rename(tmpfile, outfile)

    print("Finished")

    return outfile
def main(args):
    """Simplify a merged multi-sample VCF file.

    Args:
        args: list of command-line arguments (without the program name),
            parsed with ``argparse``; see the option definitions below.

    For every data line holding a single-nucleotide REF and single-nucleotide
    ALT alleles, the selected samples are grouped by their genotype (pair of
    nucleotides) and one register per group is handed to
    ``vcf_holder.printRegister``.  Lines are dropped when any selected sample
    has no coverage (unless ``--keep-no-coverage``) or is heterozygous
    (unless ``--keep-heterozygous``).  Samples that are homozygous for the
    reference allele are counted but not listed.

    Files written, all derived from the output base name ``outbn``:
        * ``outbn.list.csv``: the sample list (written when the column
          header is read);
        * ``outbn.list.csv.vcf.gz``: a symlink to the input, when missing;
        * ``outbn.list.csv.vcf.gz.simplified.vcf.gz``: the result, written
          to a ``.tmp`` name first and renamed at the end.

    The process exits with status 1 when the input cannot be checked or the
    result file already exists.
    """
    parser = argparse.ArgumentParser(description='Simplify merged VCF file.')
    parser.add_argument('-i', '--input', dest='input', required=True,
                        nargs='?', type=str, help='Input file')
    parser.add_argument('-o', '--output', dest='output', default=None,
                        nargs='?', type=str, help='Output file')
    parser.add_argument('-t', '--table', dest='table', default=None,
                        nargs='?', type=str, help='Input table')
    parser.add_argument('-k', '--keys', dest='keys', default=None,
                        nargs='?', type=str, help='Input keys')
    parser.add_argument('-v', '--table-values', dest='table_vs', default=None,
                        nargs='?', type=str, help='Input table values')
    parser.add_argument('-c', '--chromosome-translation', dest='translation',
                        default=None, nargs='?', type=str,
                        help='Translation table to chromosome names [e.g.: 1:Chr1;2:Chr2')
    parser.add_argument('-s', '--samples', dest='samples', default=None,
                        nargs='?', type=str,
                        help='Samples (Columns) to keep [e.g.: Spp1;Spp3;Spp5')
    parser.add_argument('-n', '--keep-no-coverage', dest='keep_no_coverage',
                        action='store_true',
                        help='Keep rows containing no coverage')
    parser.add_argument('-e', '--keep-heterozygous', dest='keep_heterozygous',
                        action='store_true', help='Keep rows hoterozygosity')

    options = parser.parse_args(args)

    print("Options %s" % (options,))

    invcf = options.input
    try:
        checkfile(invcf)
        print("input vcf: %s" % invcf)
    except (Exception, SystemExit):
        # checkfile is defined elsewhere and may either raise or exit on a
        # bad path; both are turned into a usage message.  Ctrl-C is no
        # longer swallowed, unlike with the previous bare ``except``.
        parser.print_usage()
        print("EG.: %s --input 1001genomes_snp-short-indel_only_ACGTN.vcf.gz" % sys.argv[0])
        sys.exit(1)

    outbn = invcf
    if options.output is not None:
        outbn = options.output

    outbn += (".nc" if options.keep_no_coverage else "") + \
             (".het" if options.keep_heterozygous else "")

    listFile = outbn + '.list.csv'
    vcfFile = outbn + '.list.csv.vcf.gz'
    outFile = outbn + '.list.csv.vcf.gz.simplified.vcf.gz'
    outFileTmp = outbn + '.list.csv.vcf.gz.simplified.tmp.vcf.gz'

    if os.path.exists(outFile):
        print("Out File (%s) EXISTS. quitting" % outFile)
        sys.exit(1)

    print("Out File: %s" % outFile)

    try:
        intbl = options.table
        checkfile(intbl)
        print("Input Table: %s" % intbl)
    except (Exception, SystemExit):
        # No (usable) table: carry on without sample renaming.
        intbl = None

    tbl_k = None
    if options.keys is not None:
        tbl_k = options.keys
        print("Input Table keys: %s" % tbl_k)

    tbl_vs = None
    if options.table_vs is not None:
        tbl_vs = options.table_vs.split(',')
        print("Table values: %s" % options.table_vs)

    # ``data`` maps a sample name found in the VCF to the name to write out.
    data, atad = (None, None)
    if intbl:
        data, atad = get_translation(intbl, tbl_k, tbl_vs)
        print("DATA %s" % (data,))
        print("ATAD %s" % (atad,))

    translation = None
    if options.translation is not None:
        translation = {}
        for pair in options.translation.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst
        print("Translation %s" % (translation,))

    columns = None
    if options.samples is not None:
        columns = options.samples.split(';')
        assert len(columns) > 0, "No Columns %s" % str(columns)

    vcf_holder = vcf()
    names = None
    # selected[i] tells whether sample column i was requested.  It is
    # computed from the ORIGINAL header names: ``names`` is renamed in place
    # through ``data`` below, so comparing ``names`` with ``columns`` on data
    # lines (as was done before) dropped every sample whenever both a table
    # and a sample selection were given.
    selected = None

    with openfile(invcf, 'r') as fhdi:
        with openvcffile(outFileTmp, 'w', compresslevel=1) as fhdv:
            vcf_holder.setFhd(fhdv)

            for line in fhdi:
                line = line.strip()

                if len(line) == 0:
                    continue

                if line.startswith("#"):  # header
                    print("HEADER %s" % line)

                    if line.startswith("##"):  # definition lines
                        print("HEADER :: DEF %s" % line)
                        continue

                    # column description
                    print("HEADER :: COL %s" % line)
                    cols = line.split("\t")
                    shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                    names = cols[9:]

                    print("HEADER :: COL :: SHARED %s" % (shared,))

                    if columns is not None:
                        cdiff = list(set(columns) - set(names))
                        assert len(cdiff) == 0, "Unknown column name: %s" % (str(cdiff))

                    selected = [columns is None or name in columns
                                for name in names]

                    # 'wb': the Python 2 csv module wants a binary-mode file.
                    with open(listFile, 'wb') as fhdl:
                        writer = csv.writer(fhdl, delimiter='\t', quotechar='"')

                        for ln, name in enumerate(names):
                            if not selected[ln]:
                                continue

                            cols = ["1", "%s|%d" % (invcf, ln + 1), name]

                            if data is not None:
                                assert name in data, "name %s not in db %s" % (name, str(data))
                                cols[2] = data[name]
                                names[ln] = data[name]

                            writer.writerow(cols)

                    print("HEADER :: COL :: NAMES %s" % (names,))

                    vcf_holder.printVcfHeader(names)
                    continue

                # ---- data line ------------------------------------------
                cols = line.split("\t")
                assert len(cols) > 9

                info = cols[8]  # FORMAT column, e.g. GT:DP:...
                assert ':' in info, line

                infoC = info.split(':')
                assert len(infoC) > 1
                assert 'GT' in infoC, line

                # Position of GT among the FORMAT *fields*.  ``info.index``
                # returned a character offset in the string instead, which
                # is only right when GT happens to come first.
                gtpos = infoC.index('GT')

                register = {
                    'chrom': cols[0],
                    'pos': int(cols[1]),
                    'src': cols[3],
                    'dst': cols[4],
                    'desc': {},  # (nuc0, nuc1) -> list of sample names
                    'stats': {
                        'unphased': 0,
                        'phased': 0,
                        'gap': 0,
                        'ref': 0,
                        'h**o': 0,
                        'het': 0,
                        'x_mnp_ref': 0,
                        'x_mnp_alt': 0,
                        'x_gap': 0,
                        'x_het': 0
                    }
                }

                if len(cols[3]) > 1:
                    print("MNP ref %s" % cols[3])
                    vcf_holder.add_stat(cols[0], 'x_mnp_ref', 1)
                    continue

                if any([len(x) != 1 for x in cols[4].split(',')]):
                    print("MNP alt %s" % cols[4])
                    vcf_holder.add_stat(cols[0], 'x_mnp_alt', 1)
                    continue

                has_gap = False
                is_het = False
                for colNum, desc in enumerate(cols[9:]):
                    if not selected[colNum]:
                        continue

                    if (desc == './.') or (desc == '.'):
                        # sample without any call
                        if not options.keep_no_coverage:
                            vcf_holder.add_stat(cols[0], 'x_gap', 1)
                            has_gap = True
                            break
                        register['stats']['gap'] += 1
                        continue

                    assert ':' in desc, desc + " " + str(cols[9:])
                    descC = desc.split(":")
                    assert len(descC) > 1

                    if len(infoC) != len(descC):
                        # truncated sample field: handled like no coverage
                        if not options.keep_no_coverage:
                            vcf_holder.add_stat(cols[0], 'x_gap', 1)
                            has_gap = True
                            break
                        register['stats']['gap'] += 1
                        continue

                    gtDesc = descC[gtpos]
                    gt0, gt1 = (None, None)

                    if '/' in gtDesc:
                        gt0, gt1 = gtDesc.split('/')
                        register['stats']['unphased'] += 1
                    elif '|' in gtDesc:
                        gt0, gt1 = gtDesc.split('|')
                        register['stats']['phased'] += 1
                    else:
                        assert False, 'unknown info fomat: %s (%s, %s)' % (gtDesc, info, desc)

                    if gt0 == '.' or gt1 == '.':
                        # Uncalled allele: same policy as the two no-coverage
                        # cases above.  This branch used to bump a
                        # non-existent 'uncalled' counter (KeyError) and had
                        # the drop/keep bookkeeping swapped.
                        if not options.keep_no_coverage:
                            vcf_holder.add_stat(cols[0], 'x_gap', 1)
                            has_gap = True
                            break
                        register['stats']['gap'] += 1
                        continue

                    if len(set([gt0, gt1])) == 1:
                        register['stats']['h**o'] += 1
                        if gt0 == '0':
                            # homozygous identical to reference: not listed
                            register['stats']['ref'] += 1
                            continue
                    else:
                        if not options.keep_heterozygous:
                            vcf_holder.add_stat(cols[0], 'x_het', 1)
                            is_het = True
                            break
                        # The row is kept, the heterozygous sample is only counted.
                        register['stats']['het'] += 1
                        continue

                    # Allele number 0 is REF, n > 0 is the n-th ALT.
                    dstC = register['dst'].split(',')
                    nuc0 = register['src'] if gt0 == '0' else dstC[int(gt0) - 1]
                    nuc1 = register['src'] if gt1 == '0' else dstC[int(gt1) - 1]
                    nucK = (nuc0, nuc1)

                    if nucK not in register['desc']:
                        register['desc'][nucK] = []

                    register['desc'][nucK].append(names[colNum])

                if has_gap:
                    continue

                if is_het:
                    continue

                if len(register['desc']) > 0:
                    if translation:
                        register['chrom'] = translation.get(register['chrom'], register['chrom'])

                    # One output register per genotype group; 'desc' and
                    # 'dst' are overwritten for each group before printing.
                    descs = deepcopy(register['desc'])
                    for desc in descs:
                        register['desc'] = '|'.join(descs[desc])
                        register['dst'] = ','.join(sorted(set(desc)))
                        vcf_holder.printRegister(register)

            fhdv.flush()

    print("\nGLOBAL STATS")
    print('Global Stats : %s' % " ".join(
        ["{:s}: {:10,d}".format(*i) for i in sorted(vcf_holder.stats.items())]))

    os.rename(outFileTmp, outFile)

    # Touch the three files in order (list, input link, result).
    os.utime(listFile, None)

    if not os.path.exists(vcfFile):
        os.symlink(invcf, vcfFile)
    os.utime(vcfFile, None)

    os.utime(outFile, None)
def main():
    """Split a multi-sample VCF into one VCF per sample column.

    The input path is read from ``sys.argv[1]`` (exit status 1 when absent).
    When the column header line is reached:

    * sample names are normalised (see the comment in the code);
    * ``<infile>.lst`` is written with one ``1<TAB>path<TAB>name`` row per
      sample;
    * one ``<infile>.<index>.<name>.vcf.gz`` file is created per sample and
      receives the definition lines, a ``##Split from`` line and its own
      column header.

    Data lines whose REF or any ALT is longer than one character are
    ignored.  For the others, a sample's call is exported only if it starts
    with one of the module-level ``targets``; the ALT column of the exported
    line is replaced by the single ALT allele at the index of the first
    matching target.  A per-sample summary is printed when the files are
    closed.
    """
    try:
        infile = sys.argv[1]
    except IndexError:
        print("no input file given")
        print("%s <INPUT MULTICOLUMN CSV>" % sys.argv[0])
        sys.exit(1)

    checkfile(infile)

    print("splitting %s" % infile)

    defs = []
    outfiles = []  # per sample: [name, path, handle, skipped, valid]
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0

    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print("HEADER %s" % line)

                if line.startswith("##"):  # definition lines
                    print("HEADER :: DEF %s" % line)
                    defs.append(line)
                    continue

                # column description
                print("HEADER :: COL %s" % line)
                cols = line.split("\t")
                num_cols = len(cols)
                shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT

                # Project specific, used to deal with sample names that
                # contain "_"; may be changed later.
                # By default, you should make sure that the sample name does
                # not contain "_" or ".", and that
                # len(sample_name) + len(chromosome_name) <= 31.  If that is
                # not the case then sample name and/or chromosome name may
                # have to be modified (as done here).
                names = []
                for name in cols[9:]:
                    if "_" in name:
                        # drop the last "_"-separated part, glue the rest
                        name = name.rpartition("_")[0].replace("_", "")
                    if "." in name:
                        name = name.partition(".")[0]
                    names.append(name)

                print("HEADER :: COL :: SHARED %s" % (shared,))
                print("HEADER :: COL :: NAMES %s" % (names,))

                outfiles = [None] * len(names)

                idx_width = str(len("%d" % len(names)))
                name_width = str(max([len(x) for x in names] or [0]))
                path_fmt = "%s.%0" + idx_width + "d.%s.vcf.gz"
                msg_fmt = "creating %" + idx_width + "d %-" + name_width + "s to %s"

                # ``with`` closes the list file; the handle used to leak.
                with open("%s.lst" % infile, 'w') as outlist:
                    for num, name in enumerate(names):
                        nof = path_fmt % (infile, num + 1, sanitize(name))
                        print(msg_fmt % (num + 1, name, nof))

                        handle = openfile(nof, 'w')
                        #                              skipped valid
                        outfiles[num] = [name, nof, handle, 0, 0]

                        outlist.write("1\t%s\t%s\n" % (os.path.abspath(nof), name))

                        handle.write("\n".join(defs) + "\n")
                        handle.write("##Split from: %s column %d\n" % (os.path.abspath(infile), num + 1))
                        handle.write("\t".join(shared))
                        handle.write("\t%s\n" % name)
                        handle.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' % (line_count, valid, skipped))
                    for entry in outfiles:
                        entry[2].flush()
                sys.stdout.flush()

            cols = line.split("\t")
            assert len(cols) == num_cols

            ref = cols[3]
            alts = cols[4].split(',')

            if len(ref) > 1 or any([len(x) > 1 for x in alts]):
                # exclude MNP or indel
                continue

            shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
            data = cols[9:]

            if cols[0] != lastCol:
                print("\nChromosome %s" % cols[0])
                lastCol = cols[0]

            for pos, ndata in enumerate(data):
                res = [ndata.startswith(x) for x in targets]

                if not any(res):
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                # NOTE(review): assumes targets[i] is the genotype prefix
                # that selects alts[i] -- confirm against the definition of
                # ``targets``.
                shared[4] = alts[res.index(True)]
                valid += 1
                outfiles[pos][4] += 1  # valid

                outfiles[pos][2].write("\t".join(shared) + "\t" + ndata + "\n")

    for num, entry in enumerate(outfiles):
        entry[2].close()
        print(("closing %" + str(len("%d" % len(outfiles))) +
               "d %-" + str(max([len(x[0]) for x in outfiles])) +
               "s :: %-" + str(max([len(x[1]) for x in outfiles])) +
               "s :: skipped %6d exported %6d total %7d") % (
                   num + 1, entry[0], entry[1], entry[3], entry[4],
                   entry[3] + entry[4]))
def main():
    """Split a multi-sample VCF into one VCF per sample column.

    The input path is read from ``sys.argv[1]`` (exit status 1 when absent).
    When the column header line is reached, ``<infile>.lst`` is written
    (one ``1<TAB>path<TAB>name`` row per sample) and one
    ``<infile>_<index>_<name>.vcf.gz`` file is created per sample, holding
    the definition lines, a ``##Split from`` line and the column header.

    Every data line is then copied, with the nine shared columns, to the
    file of each sample whose call does not start with one of the
    module-level ``ignores`` prefixes.  A per-sample summary is printed when
    the files are closed.
    """
    try:
        infile = sys.argv[1]
    except IndexError:
        print("no input file given")
        print("%s <INPUT MULTICOLUMN CSV>" % sys.argv[0])
        sys.exit(1)

    checkfile(infile)

    print("splitting %s" % infile)

    defs = []
    outfiles = []  # per sample: [name, path, handle, skipped, valid]
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0

    with openfile(infile, 'r') as fhd:
        for line in fhd:
            line = line.strip()

            if len(line) == 0:
                continue

            if line.startswith("#"):  # header
                print("HEADER %s" % line)

                if line.startswith("##"):  # definition lines
                    print("HEADER :: DEF %s" % line)
                    defs.append(line)
                    continue

                # column description
                print("HEADER :: COL %s" % line)
                cols = line.split("\t")
                num_cols = len(cols)
                shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                names = cols[9:]

                print("HEADER :: COL :: SHARED %s" % (shared,))
                print("HEADER :: COL :: NAMES %s" % (names,))

                outfiles = [None] * len(names)

                idx_width = str(len("%d" % len(names)))
                name_width = str(max([len(x) for x in names] or [0]))
                path_fmt = "%s_%0" + idx_width + "d_%s.vcf.gz"
                msg_fmt = "creating %" + idx_width + "d %-" + name_width + "s to %s"

                # ``with`` closes the list file; the handle used to leak.
                with open("%s.lst" % infile, 'w') as outlist:
                    for num, name in enumerate(names):
                        nof = path_fmt % (infile, num + 1, sanitize(name))
                        print(msg_fmt % (num + 1, name, nof))

                        handle = openfile(nof, 'w')
                        #                              skipped valid
                        outfiles[num] = [name, nof, handle, 0, 0]

                        outlist.write("1\t%s\t%s\n" % (os.path.abspath(nof), name))

                        handle.write("\n".join(defs) + "\n")
                        handle.write("##Split from: %s column %d\n" % (os.path.abspath(infile), num + 1))
                        handle.write("\t".join(shared))
                        handle.write("\t%s\n" % name)
                        handle.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' % (line_count, valid, skipped))
                    for entry in outfiles:
                        entry[2].flush()
                sys.stdout.flush()

            cols = line.split("\t")
            assert len(cols) == num_cols

            shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
            # No trailing tab here: the separator is added when the line is
            # written.  Having both produced an empty extra column on every
            # data line, one more than the 10-column header of each file.
            shared_str = "\t".join(shared)
            data = cols[9:]

            if cols[0] != lastCol:
                print("\nChromosome %s" % cols[0])
                lastCol = cols[0]

            for pos, ndata in enumerate(data):
                if any([ndata.startswith(x) for x in ignores]):
                    skipped += 1
                    outfiles[pos][3] += 1  # skipped
                    continue

                valid += 1
                outfiles[pos][4] += 1  # valid
                outfiles[pos][2].write(shared_str + "\t" + ndata + "\n")

    for num, entry in enumerate(outfiles):
        entry[2].close()
        print(("closing %" + str(len("%d" % len(outfiles))) +
               "d %-" + str(max([len(x[0]) for x in outfiles])) +
               "s :: %-" + str(max([len(x[1]) for x in outfiles])) +
               "s :: skipped %6d exported %6d total %7d") % (
                   num + 1, entry[0], entry[1], entry[3], entry[4],
                   entry[3] + entry[4]))
def main():
    """Write one single-sample VCF per sample column of a multi-sample VCF.

    Usage: the only command-line argument (``sys.argv[1]``) is the input
    file; without it a usage message is printed and the process exits with
    status 1.

    Header handling: ``##`` lines are collected; at the column header line
    the sample list ``<infile>.lst`` and one output file per sample
    (``<infile>_<index>_<name>.vcf.gz``) are created, each starting with the
    collected definitions, a ``##Split from`` line and a 10-column header.

    Data handling: a sample's call is skipped when it starts with any prefix
    in the module-level ``ignores``; otherwise the nine shared columns plus
    the call are appended to that sample's file.  Counters of skipped and
    exported calls are printed per sample at the end.
    """
    try:
        infile = sys.argv[1]
    except IndexError:
        print("no input file given")
        print("%s <INPUT MULTICOLUMN CSV>" % sys.argv[0])
        sys.exit(1)

    checkfile(infile)

    print("splitting %s" % infile)

    # Indexes into the per-sample records kept in ``outfiles``.
    NAME, PATH, HANDLE, SKIPPED, VALID = range(5)

    defs = []
    outfiles = []
    valid = 0
    skipped = 0
    lastCol = ""
    num_cols = None
    line_count = 0

    with openfile(infile, 'r') as fhd:
        for raw in fhd:
            line = raw.strip()

            if not line:
                continue

            if line.startswith("##"):
                # definition lines
                print("HEADER %s" % line)
                print("HEADER :: DEF %s" % line)
                defs.append(line)
                continue

            if line.startswith("#"):
                # column description
                print("HEADER %s" % line)
                print("HEADER :: COL %s" % line)

                cols = line.split("\t")
                num_cols = len(cols)
                shared = cols[:9]  # CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
                names = cols[9:]

                print("HEADER :: COL :: SHARED %s" % (shared,))
                print("HEADER :: COL :: NAMES %s" % (names,))

                digits = str(len("%d" % len(names)))
                longest = str(max([len(x) for x in names] or [0]))
                outfiles = []

                # The list file is closed as soon as it is complete; it was
                # previously left open.
                with open("%s.lst" % infile, 'w') as outlist:
                    for number, name in enumerate(names, 1):
                        nof = ("%s_%0" + digits + "d_%s.vcf.gz") % (infile, number, sanitize(name))
                        print(("creating %" + digits + "d %-" + longest + "s to %s") % (number, name, nof))

                        out = openfile(nof, 'w')
                        outfiles.append([name, nof, out, 0, 0])

                        outlist.write("1\t%s\t%s\n" % (os.path.abspath(nof), name))

                        out.write("\n".join(defs) + "\n")
                        out.write("##Split from: %s column %d\n" % (os.path.abspath(infile), number))
                        out.write("\t".join(shared))
                        out.write("\t%s\n" % name)
                        out.flush()

                continue

            line_count += 1

            if line_count % 1000 == 0:
                sys.stdout.write('.')
                if line_count % 100000 == 0:
                    sys.stdout.write(' lines %12d valid %12d skipped %12d\n' % (line_count, valid, skipped))
                    for record in outfiles:
                        record[HANDLE].flush()
                sys.stdout.flush()

            cols = line.split("\t")
            assert len(cols) == num_cols

            # Shared prefix WITHOUT a trailing tab.  The old code appended a
            # tab here and another one when writing, which inserted an empty
            # column between FORMAT and the sample call.
            shared_str = "\t".join(cols[:9])
            data = cols[9:]

            if cols[0] != lastCol:
                print("\nChromosome %s" % cols[0])
                lastCol = cols[0]

            for pos, ndata in enumerate(data):
                record = outfiles[pos]

                if any([ndata.startswith(x) for x in ignores]):
                    skipped += 1
                    record[SKIPPED] += 1
                    continue

                valid += 1
                record[VALID] += 1
                record[HANDLE].write(shared_str + "\t" + ndata + "\n")

    for number, record in enumerate(outfiles, 1):
        record[HANDLE].close()
        print(("closing %" + str(len("%d" % len(outfiles))) +
               "d %-" + str(max([len(x[NAME]) for x in outfiles])) +
               "s :: %-" + str(max([len(x[PATH]) for x in outfiles])) +
               "s :: skipped %6d exported %6d total %7d") % (
                   number, record[NAME], record[PATH], record[SKIPPED],
                   record[VALID], record[SKIPPED] + record[VALID]))
def main(incsv, translation_str):
    """Merge every VCF named in the list file ``incsv`` into one VCF.

    Args:
        incsv: tab-separated list file.  ``#`` lines and blank lines are
            ignored; the first (up to) three columns of every other line
            are forwarded to ``vcfHeap.addFile``.
        translation_str: ``None`` or a ``src:dst;src:dst`` table used to
            rename chromosomes.

    Returns:
        ``incsv + '.vcf.gz'``, the path of the merged file.

    Records are pulled from the heap until it is empty and written in
    batches of 1000 to a temporary file, which is renamed to the final name
    once complete.  Exits with status 1 if ``incsv`` does not exist or the
    final file is already there.
    """
    outfile = incsv + '.vcf.gz'

    if not os.path.exists(incsv):
        print("input file does not exists. quitting like a whimp")
        sys.exit(1)

    print("reading %s" % incsv)

    translation = {}
    if translation_str is not None:
        for pair in translation_str.split(';'):
            src, dst = pair.split(':')
            assert src not in translation
            translation[src] = dst

    print("Translation %s" % (translation,))

    if os.path.exists(outfile):
        print("output file %s exists. quitting like a whimp" % outfile)
        sys.exit(1)

    print("saving to %s" % outfile)

    data = vcfHeap(translation=translation)

    # The list file is closed once read (the handle was never closed before).
    with openfile(incsv, 'r') as cfh:
        for raw in cfh:
            if raw[0] == "#":
                continue

            entry = raw.strip()
            if len(entry) == 0:
                # Blank lines carry no file entry; do not trip the assert.
                continue

            cols = entry.split('\t')
            assert len(cols) >= 2
            print("%s %s" % (cols, cols[:3]))
            data.addFile(*cols[:3])

    print_every = 1000
    dot_every = print_every // 100  # explicit integer division
    num_lines = 0
    lines = []

    tmp_path = outfile + '.tmp.vcf.gz'
    mfh = openvcffile(tmp_path, 'w', compresslevel=1)
    try:
        mfh.write(data.getVcfHeader())
        mfh.flush()

        while not data.isempty():
            val = data.next()

            if val is not None:  # if not empty
                lines.append(str(val))
                pending = len(lines)

                if pending % dot_every == 0:
                    sys.stdout.write('.')

                if pending % print_every == 0:
                    # Full batch: report the running total and write it out.
                    num_lines += pending
                    sys.stdout.write(' {:14,d}\n'.format(num_lines))
                    mfh.write("".join(lines))
                    mfh.flush()
                    lines = []

                sys.stdout.flush()

            else:
                print("val is empty")
                break

        # Last, incomplete batch.
        mfh.write("".join(lines))
        mfh.flush()
    finally:
        # Do not leave the temporary file open when something goes wrong.
        mfh.close()

    num_lines += len(lines)

    sys.stdout.write('\nTotal {:14,d}\n'.format(num_lines))

    os.rename(tmp_path, outfile)

    print("Finished")

    return outfile