def main(arguments=sys.argv[1:]):
    """Main method for mvf_filter.

    Parses the command line and then runs in one of two modes:

    * test mode (``--test``): applies the requested actions to a single
      manually supplied "location alleles" line, reports each step on
      stdout and exits;
    * main mode: streams every entry of the input MVF file through the
      requested actions and writes the surviving entries to the output
      MVF file in blocks of ``--linebuffer`` entries.

    Arguments:
        arguments: list of command-line tokens (defaults to the value of
            ``sys.argv[1:]`` at the time this function was defined).

    Returns:
        An empty string after main mode completes.  The ``--version`` and
        ``--test`` paths end with ``sys.exit()`` instead of returning.

    Raises:
        RuntimeError: if neither ``--mvf`` nor ``--test`` is given, if
            ``--out`` is missing outside of test mode, or if no
            ``--actions`` are given.
    """
    parser = argparse.ArgumentParser(
        description="Filters and Transforms MVF files")
    parser.add_argument("--mvf", help="input MVF file")
    parser.add_argument("--out", help="output MVF file")
    parser.add_argument("--actions", nargs='*',
                        help=("set of actions:args to perform,"
                              " note these are done in order as listed"))
    parser.add_argument("--test", help="manually input a line for testing")
    parser.add_argument("--testnchar", type=int,
                        help="total number of samples for test string")
    parser.add_argument("--modulehelp", action="store_true",
                        help="prints full module list and descriptions")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to write at once to MVF")
    parser.add_argument("--verbose", action="store_true",
                        help="report every line (for debugging)")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    # The arguments only need to be parsed once.
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    time0 = time()
    if args.modulehelp:
        # NOTE(review): whether modulehelp() exits is not visible here;
        # the argument checks below still run if it returns.
        modulehelp()
    if not args.mvf and not args.test:
        raise RuntimeError("No input file specified with --mvf")
    if not args.out and not args.test:
        # The option is spelled --out; report the real flag name.
        raise RuntimeError("No output file specified with --out")
    if not args.actions:
        raise RuntimeError("No --actions specified!")
    ## Establish Input MVF
    if args.test:
        # Without --testnchar the length of the whole test string is used.
        ncol = args.testnchar or len(args.test)
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    ## Create Actionset
    # Each action is unpacked below as (name, type, function, argument).
    actionset = build_actionset(args.actions, ncol)
    ## TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write(
            "MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                # A transform can change the encoding, so re-detect it.
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if not actionfunc([int(x) for x in loc.split(':')]):
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                sys.stdout.write("Location Pass\n")
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    ## MAIN MODE
    ## Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    ## Reprocess header if actions are used that filter columns
    if any(action[0] in ('columns', 'collapsepriority')
           for action in actionset):
        labels = outmvf.metadata['labels'][:]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg]
            elif actionname == 'collapsepriority':
                # Drop every column listed after the first argument.
                dropped = actionarg[1:]
                labels = [label for idx, label in enumerate(labels)
                          if idx not in dropped]
        # Renumber the surviving samples 0..n-1 in their new order.
        oldindicies = mvf.get_sample_indices(labels)
        outmvf.metadata['samples'] = {
            i: mvf.metadata['samples'][oldindicies[i]]
            for i, _ in enumerate(labels)}
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    ## End header editing
    linebuffer = []
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        # invar = invariant (single character)
        # refvar (all different than reference, two chars)
        # onecov (single coverage, + is second character)
        # onevar (one variable base, + is third character)
        # full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if not actionfunc([chrom, pos]):
                    linefail = True
            if linefail:
                # Actions run in order; stop at the first failure.
                break
        if not linefail and transformed:
            if linetype == 'full':
                alleles = mvf.encode(alleles)
            if not alleles:
                linefail = True
        if not linefail:
            linebuffer.append((chrom, pos, (alleles,)))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if len(linebuffer) >= args.linebuffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    # Flush whatever is left over from the last partial block.
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    if not args.quiet:
        print("Completed in {} seconds".format(time() - time0))
    return ''
def main(arguments=sys.argv[1:]):
    """Main method for mvf_join.

    Builds one output MVF file from several input MVF files.  The header
    of one input (the first, or the one named by ``--main_header_file``)
    seeds the output metadata; sample labels and contigs from the other
    inputs are merged into it by label.  Every input is then streamed
    into the output, with sample columns and contig ids remapped wherever
    an input's numbering differs from the merged header.

    Arguments:
        arguments: list of command-line tokens (defaults to the value of
            ``sys.argv[1:]`` at the time this function was defined).

    Returns:
        An empty string once all inputs have been written.  The
        ``--version`` path ends with ``sys.exit()`` instead of returning.

    Raises:
        RuntimeError: if no input files are given, if ``--out`` is
            missing, or if ``--main_header_file`` is not one of the
            input files.
    """
    parser = argparse.ArgumentParser(
        description=("MVF joining both veritically (separate contigs) and "
                     "and horizontally (different samples)"))
    parser.add_argument("mvf", nargs="*", help="one or more mvf files")
    parser.add_argument("--out", help="output mvf file")
    # NOTE(review): --newcontigs and --newsamples are accepted but never
    # read anywhere in this function; confirm the intended behavior.
    parser.add_argument("--newcontigs", action="store_true",
                        help="Don't match contigs using labels (not IDs)")
    parser.add_argument("--newsamples", action="store_true",
                        help="Don't match samples using labels")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of entries to write in a block")
    parser.add_argument("--main_header_file",
                        help=("name of MVF file to use the headers from "
                              "(default=first in list)"))
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    # Validate everything before the output file is opened for writing,
    # so a bad command line cannot create or overwrite the output.
    if not args.mvf:
        raise RuntimeError("No input MVF files specified")
    if not args.out:
        raise RuntimeError("No output file specified with --out")
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        header_index = args.mvf.index(args.main_header_file)
    else:
        header_index = 0
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    ## Copy the chosen header file's metadata
    first_mvf = MultiVariantFile(args.mvf[header_index], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    ## Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        ## One transformer per input records its sample and contig remaps
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        concat_labels = concatmvf.metadata['labels']
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concat_labels.append(label)
                concatmvf.metadata['samples'][
                    concat_labels.index(label)] = {'label': label}
            newindex = concat_labels.index(label)
            if newindex != i:
                transformer.set_label(i, newindex)
        concat_contigs = concatmvf.metadata['contigs']
        for contigid, contigdata in mvf.metadata['contigs'].items():
            for concatid, concatdata in concat_contigs.items():
                if contigdata['label'] == concatdata['label']:
                    # Label already present: reuse the existing id.
                    newid = concatid
                    break
            else:
                # New label: keep this file's id unless it is taken.
                # A conditional expression is used so that a falsy id
                # (e.g. 0) is still kept when it is free.
                newid = (contigid if contigid not in concat_contigs
                         else concatmvf.get_next_contig_id())
                concat_contigs[newid] = contigdata
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    ## Write output header
    concatmvf.write_data(concatmvf.get_header())
    ## Now loop through each file
    entries = []
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(
                decode=False, quiet=args.quiet):
            if transformer.labels:
                # Columns can only be reordered on the decoded string.
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        alleles[transformer.labels[x]]
                        if x in transformer.labels else alleles[x]
                        for x in range(len(alleles))]))
            # Attribute access, matching how the transformer is read
            # everywhere else in this function.
            if transformer.contigs and contigid in transformer.contigs:
                contigid = transformer.contigs[contigid]
            entries.append((contigid, pos, allelesets))
            if len(entries) >= args.linebuffer:
                concatmvf.write_entries(entries)
                entries = []
        # Flush the partial block left over from this file.
        if entries:
            concatmvf.write_entries(entries)
            entries = []
    if not args.quiet:
        sys.stderr.write("done\n")
    return ''