def mstmap(args):
    """
    %prog mstmap LMD50.snps.genotype.txt

    Convert LMDs to MSTMAP input.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # passed to OptionParser below), so it is kept terse on purpose.
    #
    # Input layout as consumed by this function:
    #   line 1: skipped entirely
    #   line 2: column names; fields from the 5th onward become the matrix
    #           header (individual names)
    #   rest  : one marker per line -- four leading columns followed by one
    #           genotype code per individual ("0", "1", "2" or "3")
    # Any other genotype code raises KeyError; a marker line with fewer than
    # four columns raises ValueError.
    from jcvi.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--population_type", default="RIL6",
                 help="Type of population, possible values are DH and RILd")
    # type="float" keeps a user-supplied threshold the same type as the
    # default; without it the command-line value would be passed on as a str.
    p.add_option("--missing_threshold", default=.5, type="float",
                 help="Missing threshold, .25 excludes any marker with >25% missing")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    lmd, = args

    # Genotype codes found in the input -> symbols written to the matrix.
    table = {"0": "-", "1": "A", "2": "B", "3": "X"}
    genotypes = []
    with open(lmd) as fp:
        next(fp)  # Header
        mh = ["locus_name"] + next(fp).split()[4:]
        for row in fp:
            atoms = row.split()
            # Only the first two columns are used; the 4-way unpack is kept so
            # that short rows still fail loudly instead of being accepted.
            chrom, pos, _ref, _alt = atoms[:4]
            locus_name = ".".join((chrom, pos))
            codes = [table[x] for x in atoms[4:]]
            genotypes.append([locus_name] + codes)

    mm = MSTMatrix(genotypes, mh, opts.population_type, opts.missing_threshold)
    mm.write(opts.outfile, header=True)
def mstmap(args):
    """
    %prog mstmap bcffile/vcffile > matrixfile

    Convert bcf/vcf format to mstmap input.
    """
    # NOTE: the docstring above doubles as the command-line usage text (it is
    # passed to OptionParser below), so it is kept terse on purpose.
    from jcvi.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option(
        "--dh",
        default=False,
        action="store_true",
        help="Double haploid population, no het [default: %default]",
    )
    p.add_option(
        "--freq",
        default=0.2,
        type="float",
        help="Allele must be above frequency [default: %default]",
    )
    p.add_option(
        "--mindepth",
        default=3,
        type="int",
        help="Only trust genotype calls with depth [default: %default]",
    )
    p.add_option(
        "--missing_threshold",
        default=0.25,
        type="float",
        help="Fraction missing must be below",
    )
    p.add_option(
        "--noheader",
        default=False,
        action="store_true",
        help="Do not print MSTmap run parameters [default: %default]",
    )
    p.add_option(
        "--pv4",
        default=False,
        action="store_true",
        help="Enable filtering strand-bias, tail distance bias, etc. "
        "[default: %default]",
    )
    p.add_option(
        "--freebayes",
        default=False,
        action="store_true",
        help="VCF output from freebayes",
    )
    p.set_sep(sep=".", help="Use separator to simplify individual names")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (vcffile,) = args
    if vcffile.endswith(".bcf"):
        # A .bcf input is first converted to a sibling .vcf file through an
        # external pipeline; everything below then reads that .vcf.
        bcffile = vcffile
        vcffile = bcffile.rsplit(".", 1)[0] + ".vcf"
        cmd = "bcftools view {0}".format(bcffile)
        cmd += " | vcfutils.pl varFilter"
        if not opts.pv4:
            # Without --pv4, pass zeroed thresholds to varFilter.
            cmd += " -1 0 -2 0 -3 0 -4 0 -e 0"
        # need_update() and sh() are helpers defined outside this block;
        # presumably the conversion is skipped when the .vcf is already up to
        # date -- confirm against need_update().
        if need_update(bcffile, vcffile):
            sh(cmd, outfile=vcffile)

    freq = opts.freq
    sep = opts.sep
    # Forwarded to encode_genotype(): 1 for freebayes output, 2 otherwise.
    depth_index = 1 if opts.freebayes else 2

    ptype = "DH" if opts.dh else "RIL6"
    nohet = ptype == "DH"

    genotypes = []
    with open(vcffile) as fp:
        for row in fp:
            if row[:2] == "##":  # VCF meta-information lines
                continue

            atoms = row.split()
            if row[0] == "#":
                # Column header line: fields from the 10th onward are the
                # individuals; keep only the part of each name before `sep`.
                # This line must precede the records, since it defines `nind`
                # and `mh` used below.
                ind = [x.split(sep)[0] for x in atoms[9:]]
                nind = len(ind)
                mh = ["locus_name"] + ind
                continue

            # Marker name is "<col1>.<col2>" of the record.
            marker = "{0}.{1}".format(*atoms[:2])

            # encode_genotype() is defined outside this block; the filters
            # below count the "A", "B" and "-" symbols among its results.
            geno = [
                encode_genotype(
                    x, mindepth=opts.mindepth, depth_index=depth_index, nohet=nohet
                )
                for x in atoms[9:]
            ]
            assert len(geno) == nind, (
                "Marker {0}: {1} genotype columns, expected {2}".format(
                    marker, len(geno), nind
                )
            )
            f = 1.0 / nind

            # Drop markers where either allele is rarer than --freq ...
            if geno.count("A") * f < freq:
                continue
            if geno.count("B") * f < freq:
                continue
            # ... or where too many calls are missing.
            if geno.count("-") * f > opts.missing_threshold:
                continue

            genotypes.append([marker] + geno)

    mm = MSTMatrix(genotypes, mh, ptype, opts.missing_threshold)
    mm.write(opts.outfile, header=(not opts.noheader))