def main(arguments=sys.argv[1:]):
    """Main method for mvf_filter.

    Parses the command line, builds the ordered action set from --actions
    and then either runs one manually supplied line through the actions
    (--test) or streams every entry of the input MVF through them into
    the output MVF.

    Arguments:
        arguments: list of command-line tokens (default: sys.argv[1:])

    Returns:
        '' after the output MVF has been written. The --version and --test
        modes end the process through sys.exit() instead of returning.

    Raises:
        RuntimeError: when --mvf or --out is missing outside test mode,
            or when no --actions were given.
    """
    parser = argparse.ArgumentParser(
        description=""" Filters and Transforms MVF files""")
    parser.add_argument("--mvf", help="input MVF file")
    parser.add_argument("--out", help="output MVF file")
    parser.add_argument("--actions", nargs='*',
                        help=("set of actions:args to perform,"
                              " note these are done in order as listed"))
    parser.add_argument("--test", help="manually input a line for testing")
    parser.add_argument("--testnchar", type=int,
                        help="total number of samples for test string")
    parser.add_argument("--modulehelp", action="store_true",
                        help="prints full module list and descriptions")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to write at once to MVF")
    parser.add_argument("--verbose", action="store_true",
                        help="report every line (for debugging)")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    # Parsed once; the original parsed the same argument list twice.
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    time0 = time()
    if args.modulehelp:
        # NOTE(review): modulehelp() is defined elsewhere; if it returns
        # instead of exiting, the argument checks below still run.
        modulehelp()
    if not args.mvf and not args.test:
        raise RuntimeError("No input file specified with --mvf")
    if not args.out and not args.test:
        # The option is --out (the message used to say --outs).
        raise RuntimeError("No output file specified with --out")
    if not args.actions:
        raise RuntimeError("No --actions specified!")
    ## Establish Input MVF
    if args.test:
        ncol = args.testnchar or len(args.test)
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    ## Create Actionset
    actionset = build_actionset(args.actions, ncol)
    ## TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write(
            "MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, _ in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                # A transform may change the encoding, so re-detect it.
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if not actionfunc([int(x) for x in loc.split(':')]):
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    ## MAIN MODE
    ## Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    ### reprocess header if actions are used that filter columns
    column_actions = ('columns', 'collapsepriority')
    if any(action[0] in column_actions for action in actionset):
        labels = outmvf.metadata['labels'][:]
        for actionname, _, _, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg]
            elif actionname == 'collapsepriority':
                # Every column listed after the first argument is dropped.
                dropped = actionarg[1:]
                labels = [label for x, label in enumerate(labels)
                          if x not in dropped]
        oldindices = mvf.get_sample_indices(labels)
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindices[i]]
        outmvf.metadata['samples'] = newsamples
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    ## End header editing
    linebuffer = []
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        linefail = False
        transformed = False
        # Actions run in the order given; the first failure ends the line.
        for _, actiontype, actionfunc, _ in actionset:
            if actiontype == 'filter':
                linefail = not actionfunc(alleles, linetype)
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                linefail = linetype == 'empty'
            elif actiontype == 'location':
                linefail = not actionfunc([chrom, pos])
            if linefail:
                break
        if not linefail and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                linefail = True
        if linefail:
            if args.verbose:
                sys.stdout.write("FAIL\n")
            continue
        linebuffer.append((chrom, pos, (alleles,)))
        if args.verbose:
            sys.stdout.write("{}\n".format(alleles))
        if len(linebuffer) == args.linebuffer:
            outmvf.write_entries(linebuffer)
            linebuffer = []
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    if not args.quiet:
        print("Completed in {} seconds".format(time() - time0))
    return ''
def main(arguments=sys.argv[1:]):
    """Main method for fasta2mvf.

    Reads every record of a multi-sample FASTA file into memory, groups
    the sequences by contig and sample, writes an MVF header and then
    writes one encoded MVF entry per alignment column.

    Arguments:
        arguments: list of command-line tokens (default: sys.argv[1:])

    Returns:
        '' after the MVF file has been written. --version prints the
        version and exits through argparse.
    """
    parser = argparse.ArgumentParser(
        description=(""" Converts multisample-FASTA to MVF file """
                     """with filtering """))
    parser.add_argument("--fasta", help="input FASTA file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument(
        "--contigids", nargs='*',
        help=("""manually specify one or more contig ids as ID:NAME"""))
    parser.add_argument(
        "--samplereplace", nargs="*",
        help=("""one or more TAG:NEWLABEL or TAG, items, if TAG found """
              """in sample label, replace with NEW (or TAG if NEW not """
              """specified) NEW and TAG must each be unique"""))
    parser.add_argument("--reflabel", default="REF",
                        help="label for reference sample (default='REF')")
    parser.add_argument(
        "--allelesfrom", default=None,
        help=("""get additional alignment columns from INFO fields """
              """(:-separated)"""))
    parser.add_argument("--readbuffer", type=int, default=100000,
                        help="number of lines to hold in READ buffer")
    parser.add_argument("--writebuffer", type=int, default=100000,
                        help="number of lines to hold in WRITE buffer")
    # 'NONE' is the default, so it must also be accepted when given
    # explicitly; every choice below has an entry in sepchars.
    parser.add_argument(
        "--fieldsep", default="NONE",
        choices=['TAB', 'SPACE', 'DBLSPACE', 'COMMA', 'MIXED', 'PIPE',
                 'NONE'],
        help=("""FASTA field separator; assumes """
              """'>database/SEP/accession/SEP/locus' format """
              """(default='NONE')"""))
    parser.add_argument(
        "--contigfield", type=int,
        help=("""when headers are split by --fieldsep, the 0-based index """
              """of the contig id"""))
    parser.add_argument(
        "--samplefield", type=int,
        help=("""when headers are split by --fieldsep, the 0-based index """
              """of the sample id"""))
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    # action="version" exits during parsing, so --version works without
    # the required --fasta/--out options.
    parser.add_argument("-v", "--version", action="version",
                        version="Version 2015-07-07",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    # None makes str.split() split on any run of whitespace.
    sepchars = {"PIPE": "|", "TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None, "NONE": None}
    args.fieldsep = sepchars[args.fieldsep]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    current_contig = 0
    fsamples = []
    fcontigs = []
    # Both field indices must be valid positions in the split header.
    minfields = max(3, (args.contigfield or 0) + 1,
                    (args.samplefield or 0) + 1)
    for header, seq in fasta_iter(args.fasta):
        header = header.split(args.fieldsep)
        if (args.contigfield is None or args.samplefield is None
                or len(header) < minfields):
            # No usable fields: the whole record goes to a placeholder
            # contig, labelled by the first header token.
            contig = "UNK{}".format(current_contig)
            sample = header[0]
        else:
            contig = header[args.contigfield]
            sample = header[args.samplefield]
        if contig not in fcontigs:
            fcontigs.append(contig)
            fasta[contig] = {}
        if sample not in fsamples:
            fsamples.append(sample)
        fasta[contig][sample] = (len(seq), seq)
    # Move the first sample whose label contains --reflabel to the front.
    if args.reflabel:
        for idx, samplename in enumerate(fsamples):
            if args.reflabel in samplename:
                if idx:
                    fsamples.insert(0, fsamples.pop(idx))
                break
    for i, contig in enumerate(fcontigs):
        mvf.metadata['contigs'][i] = {
            'label': contig,
            'length': max(length for length, _ in fasta[contig].values())}
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    for cind, contig in enumerate(fcontigs):
        seqs = fasta[contig]
        for pos in range(mvf.metadata['contigs'][cind]['length']):
            # Pad with '-' when a sample is absent from this contig or
            # its sequence is shorter than the longest one.
            column = ''.join(
                seqs[samp][1][pos]
                if samp in seqs and pos < seqs[samp][0] else '-'
                for samp in fsamples)
            mvf_alleles = encode_mvfstring(column)
            if not mvf_alleles:
                continue
            mvfentries.append((cind, pos + 1, (mvf_alleles,)))
            if len(mvfentries) == args.writebuffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
    if mvfentries:
        # Same pre-encoded entries as the buffered flush above, so the
        # same encoded=True flag is passed.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def main(arguments=sys.argv[1:]):
    """Main method for vcf2mvf.

    Opens the input VCF, assigns MVF contig ids (honouring --contigids),
    builds the sample label list (reference label first, then the VCF
    samples, then any --allelesfrom columns), applies --samplereplace,
    writes the MVF header and then writes one encoded entry per VCF
    record yielded by the VCF reader.

    Arguments:
        arguments: list of command-line tokens (default: sys.argv[1:])

    Returns:
        "" after the MVF file has been written. --version prints the
        version and exits through argparse.
    """
    parser = argparse.ArgumentParser(
        description=(""" Converts multisample-VCF to MVF file """
                     """with filtering """))
    parser.add_argument("--vcf", help="input VCF file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--maskdepth", type=int, default=1,
                        help="below this depth mask with N/n")
    parser.add_argument(
        "--lowdepth", type=int, default=3,
        help=("""below this depth convert to lower case """
              """set to 0 to disable"""))
    parser.add_argument(
        "--maskqual", type=int, default=3,
        help=("""low quality cutoff, bases replaced by N/- """
              """set to 0 to disable"""))
    parser.add_argument(
        "--lowqual", type=int, default=20,
        help=("""below this quality convert to lower case """
              """set to 0 to disable"""))
    parser.add_argument(
        "--contigids", nargs="*",
        help=("""manually specify one or more contig ids as ID:NAME"""))
    parser.add_argument(
        "--samplereplace", nargs="*",
        help=("""one or more TAG:NEWLABEL or TAG, items, if TAG found """
              """in sample label, replace with NEW (or TAG if NEW not """
              """specified) NEW and TAG must each be unique"""))
    parser.add_argument("--reflabel", default="REF",
                        help="label for reference sample (default='REF')")
    parser.add_argument(
        "--allelesfrom", default=None,
        help=("""get additional alignment columns from INFO fields """
              """(:-separated)"""))
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to hold in read/write buffer")
    parser.add_argument(
        "--no_autoindex", action="store_true",
        help="do not automatically index contigs from the VCF")
    parser.add_argument(
        "--fieldsep", default="TAB",
        choices=["TAB", "SPACE", "DBLSPACE", "COMMA", "MIXED"],
        help="""VCF field separator (default='TAB')""")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    # action="version" exits during parsing, so --version works without
    # the required --vcf/--out options.
    parser.add_argument("-v", "--version", action="version",
                        version="Version 2015-02-26",
                        help="display version information")
    # Keep a 'version' key in the namespace: vars(args) is handed to the
    # VCF reader below and previously always contained it.
    parser.set_defaults(version=False)
    args = parser.parse_args(args=arguments)
    # None makes str.split() split on any run of whitespace.
    sepchars = {"TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None}
    args.fieldsep = sepchars[args.fieldsep]
    ## ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, "write", overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    # 'contigs' holds the contigs that still need an MVF id; entries
    # claimed by --contigids are removed as they are matched.
    contigs = vcf.metadata["contigs"].copy()
    newids = set()
    for cid, cname in (x.split(":") for x in args.contigids or ()):
        try:
            cid = int(cid)
        except ValueError:
            pass  # non-numeric ids are kept as strings
        for tempid in contigs:
            if cname in contigs[tempid]["label"]:
                mvf.metadata["contigs"][cid] = contigs.pop(tempid).copy()
                newids.add(cid)
                break  # the dict was modified, so stop iterating it
    # Automatic ids start just past the largest user-supplied integer id.
    maxcontigid = 0
    for cid in newids:
        if isinstance(cid, int):
            maxcontigid = max(maxcontigid, cid + 1)
    # Every contig left in 'contigs' is unassigned and gets the next free
    # id, in sorted order of its original id so the numbering is stable.
    for newid, tempid in enumerate(sorted(contigs), maxcontigid):
        mvf.metadata["contigs"][newid] = vcf.metadata["contigs"][tempid]
    contig_translate = dict(
        (mvf.metadata["contigs"][x]["label"], x)
        for x in mvf.metadata["contigs"])
    # PROCESS SAMPLE INFO
    samplelabels = [args.reflabel] + vcf.metadata["samples"][:]
    if args.allelesfrom:
        args.allelesfrom = args.allelesfrom.split(":")
        samplelabels += args.allelesfrom
    if args.samplereplace:
        newsample = [x.split(":") if ":" in x else (x, x)
                     for x in args.samplereplace]
        # Each label may be renamed at most once: a matched label is
        # removed from 'unmatched' (including the one at position 0).
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    del unmatched[j]
                    break
    mvf.metadata["labels"] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata["samples"][i] = {"label": label}
    mvf.metadata["ncol"] = len(mvf.metadata["labels"])
    mvf.metadata["sourceformat"] = vcf.metadata["sourceformat"]
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    for vcfrecord in vcf.iterentries(vars(args)):
        mvf_alleles = encode_mvfstring("".join(vcfrecord["genotypes"]))
        if not mvf_alleles:
            continue
        # Fall back to the raw contig name when it has no translated id.
        mvfentries.append(
            (contig_translate.get(vcfrecord["contig"], vcfrecord["contig"]),
             vcfrecord["coord"], (mvf_alleles,)))
        if len(mvfentries) == args.linebuffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
    if mvfentries:
        # Same pre-encoded entries as the buffered flush above, so the
        # same encoded=True flag is passed.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ""
'ATTTTTTTTT', 'A---------', 'ATCCCCCCCC', 'A-CCCCCCCC', 'ATGCCCCCCC', 'AGCGGGGGGG', 'AT--------', 'A-T-------', 'A--T------', ] NRAND = 100000 RANDOM_STRINGS = [''.join(choices("ATGCX-", k=10)) for _ in range(NRAND)] for x in TEST_STRINGS: print(x) y = encode_mvfstring(x) print(y) z = decode_mvfstring(y, NCOL) print(z) print(x == z) print("==========") RANDOM_PASS = 0 for x in RANDOM_STRINGS: #print(x) y = encode_mvfstring(x) #print(y) z = decode_mvfstring(y, NCOL) #print(z) #print(x == z) #print("==========")
def main(arguments=sys.argv[1:]):
    """Main method for geno2mvf.

    Opens the input .geno file, assigns MVF contig ids (honouring
    --contigids), copies the sample labels and applies --samplereplace,
    writes the MVF header and then writes one encoded entry per record
    yielded by the geno reader.

    Arguments:
        arguments: list of command-line tokens (default: sys.argv[1:])

    Returns:
        '' after the MVF file has been written. --version prints the
        version and exits through argparse.
    """
    parser = argparse.ArgumentParser(
        description=(""" Converts GATK Genotype Format to MVF file """
                     """with some filters """))
    parser.add_argument("--geno", help="input .geno file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--contigids", nargs='*',
                        help=("manually specify one or more contig ids"
                              " as ID:NAME"))
    parser.add_argument(
        "--samplereplace", nargs="*",
        help=("""one or more TAG:NEWLABEL or TAG, items, if TAG found """
              """in sample label, replace with NEW (or TAG if NEW not """
              """specified) NEW and TAG must each be unique"""))
    parser.add_argument(
        "--reflabel", default="REF",
        help="""label of the reference sample (default is first entry)""")
    parser.add_argument("--no_autoindex", action="store_true",
                        help="do not automatically index contigs")
    parser.add_argument(
        "--fieldsep", default="SPACE",
        choices=['TAB', 'SPACE', 'DBLSPACE', 'COMMA', 'MIXED'],
        help="""entry field separator (default='SPACE')""")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to hold in read/write buffer")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    # action="version" exits during parsing, so --version works without
    # the required --geno/--out options.
    parser.add_argument("-v", "--version", action="version",
                        version="Version 2015-02-01: Initial Public Release",
                        help="display version information")
    # Keep a 'version' key in the namespace: vars(args) is handed to the
    # geno reader below and previously always contained it.
    parser.set_defaults(version=False)
    args = parser.parse_args(args=arguments)
    # None makes str.split() split on any run of whitespace.
    sepchars = {"TAB": "\t", "SPACE": " ", "DBLSPACE": "  ",
                "COMMA": ",", "MIXED": None}
    args.fieldsep = sepchars[args.fieldsep]
    ## ESTABLISH GENO
    geno = GenoFile(args.geno, indexcontigs=(not args.no_autoindex))
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    # 'contigs' holds the contigs that still need an MVF id; entries
    # claimed by --contigids are removed as they are matched.
    contigs = geno.metadata['contigs'].copy()
    newids = set()
    for cid, cname in (x.split(':') for x in args.contigids or ()):
        try:
            cid = int(cid)
        except ValueError:
            pass  # non-numeric ids are kept as strings
        for tempid in contigs:
            if cname in contigs[tempid]['label']:
                mvf.metadata['contigs'][cid] = contigs.pop(tempid).copy()
                newids.add(cid)
                break  # the dict was modified, so stop iterating it
    # Automatic ids start just past the largest user-supplied integer id.
    maxcontigid = 0
    for cid in newids:
        if isinstance(cid, int):
            maxcontigid = max(maxcontigid, cid + 1)
    # Every contig left in 'contigs' is unassigned and gets the next free
    # id, in sorted order of its original id so the numbering is stable.
    for newid, tempid in enumerate(sorted(contigs), maxcontigid):
        mvf.metadata['contigs'][newid] = geno.metadata['contigs'][tempid]
    contig_translate = dict(
        (mvf.metadata['contigs'][x]['label'], x)
        for x in mvf.metadata['contigs'])
    # PROCESS SAMPLE INFO
    samplelabels = geno.metadata['samples'][:]
    if args.samplereplace:
        newsample = [x.split(':') if ':' in x else (x, x)
                     for x in args.samplereplace]
        # Each label may be renamed at most once: a matched label is
        # removed from 'unmatched' (including the one at position 0).
        unmatched = list(enumerate(samplelabels))
        for old, new in newsample:
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    del unmatched[j]
                    break
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = geno.metadata['sourceformat']
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    for record in geno.iterentries(vars(args)):
        mvf_alleles = encode_mvfstring(''.join(record['genotypes']))
        if not mvf_alleles:
            continue
        # The allele field is a 1-tuple of allele strings, matching the
        # entry layout used by the other MVF converters.
        mvfentries.append(
            (contig_translate.get(record['contig'], record['contig']),
             record['coord'], (mvf_alleles,)))
        if len(mvfentries) == args.linebuffer:
            mvf.write_entries(mvfentries, encoded=True)
            mvfentries = []
    if mvfentries:
        # Same pre-encoded entries as the buffered flush above, so the
        # same encoded=True flag is passed.
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
def main(arguments=sys.argv[1:]):
    """Main method for maf2mvf.

    Opens the input MAF through MultiAlignFile, builds the sample label
    list from --sampletags with the --reftag label moved to the front,
    writes the MVF header and then writes one encoded entry per column of
    every alignment block yielded by the MAF reader.

    Arguments:
        arguments: list of command-line tokens (default: sys.argv[1:])

    Returns:
        '' after the MVF file has been written.

    Raises:
        RuntimeError: when no --sampletags were given.
        ValueError: when --reftag is not one of the --sampletags labels.
    """
    parser = argparse.ArgumentParser(
        description=(""" Converts Multiple Alignment Files to MVF file """
                     """with some filters """))
    parser.add_argument("--maf", help="input MAF file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--reftag", help="old reference tag")
    parser.add_argument("--mvfreflabel", default="REF",
                        help="new label for reference sample (default='REF')")
    parser.add_argument("--contigids", nargs='*',
                        help=("manually specify one or more contig ids"
                              " as ID:NAME"))
    parser.add_argument(
        "--sampletags", nargs="*",
        help=("""one or more TAG:NEWLABEL or TAG, items, if TAG found """
              """in sample label, replace with NEW (or TAG if NEW not """
              """specified) NEW and TAG must each be unique"""))
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to hold in read/write buffer")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args(args=arguments)
    # Validate the sample options up front so a bad command line fails
    # with a clear message before any file is opened.
    if not args.sampletags:
        raise RuntimeError("No --sampletags specified!")
    samplelabels = [s.split(':')[0] for s in args.sampletags]
    if args.reftag not in samplelabels:
        raise ValueError(
            "--reftag {!r} must match one of the --sampletags "
            "labels".format(args.reftag))
    # The reference sample always occupies the first column.
    samplelabels.remove(args.reftag)
    samplelabels.insert(0, args.reftag)
    ## ESTABLISH MAF
    maf = MultiAlignFile(args)
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    # NOTE(review): --contigids is parsed but not applied here; every
    # alignment block is written under the single hard-coded contig id 1
    # and no entry is added to mvf.metadata['contigs'] in this function.
    contig_translate = {1: 1}
    # PROCESS SAMPLE INFO
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    for pos, length, msa in maf:
        # Samples absent from this block are filled with gaps.
        for s in samplelabels:
            if s not in msa:
                msa[s] = '-' * length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if not mvf_alleles:
                continue
            mvfentries.append(
                (contig_translate.get(msa['contig']), pos + i,
                 (mvf_alleles,)))
            if len(mvfentries) == args.linebuffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
    if mvfentries:
        # Same pre-encoded entries as the buffered flush above, so the
        # same encoded=True flag is passed.
        mvf.write_entries(mvfentries, encoded=True)
    return ''