def main(arguments=None):
    """Write one FASTA file per contig from an MVF file.

    Entries are read in file order and the per-sample sequence is
    accumulated until the contig id changes; the finished contig is then
    written to ``<outprefix>.<contig label>.fa`` with samples sorted by
    label.

    Args:
        arguments: list of command-line tokens; ``sys.argv[1:]`` is used
            when None.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--outdata`` cannot be produced from the flavor
            recorded in the MVF metadata (dna/rna -> prot, or
            prot -> dna/rna).
    """
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    mvf = MultiVariantFile(args.mvf, 'read')
    flavor = mvf.metadata['flavor']
    if (flavor in ("dna", "rna") and args.outdata == "prot") or (
            flavor == "prot" and args.outdata in ("dna", "rna")):
        raise RuntimeError(
            "--outdata {} incompatiable with '{}' flavor mvf".format(
                args.outdata, flavor))
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)

    def write_contig(contigid, contig_seqs):
        """Write the accumulated sequences of one contig to its FASTA."""
        outpath = "{}.{}.fa".format(
            args.outprefix, mvf.metadata['contigs'][contigid]['label'])
        with open(outpath, 'wt') as outfile:
            for seqname in sorted(contig_seqs):
                outfile.write(">{}\n{}\n".format(
                    seqname, ''.join(contig_seqs[seqname])))

    def mask_unknown(base):
        """Report the 'X' placeholder as 'N' in nucleotide output."""
        return 'N' if base == 'X' else base

    current_contig = ''
    seqs = {}
    for contig, _, allelesets in mvf.iterentries(quiet=args.quiet,
                                                 decode=True):
        if contig != current_contig:
            if seqs:
                # The buffered sequences belong to the contig that just
                # ended, so the file must be named after current_contig;
                # naming it after the incoming contig mislabels the data
                # and lets the next flush overwrite it.
                write_contig(current_contig, seqs)
            seqs = {}
            current_contig = contig
        for col, label in zip(sample_cols, labels):
            sample_seq = seqs.setdefault(label, [])
            if flavor in ('dna', 'rna'):
                sample_seq.append(mask_unknown(allelesets[0][col]))
            elif flavor in ('codon', 'prot') and args.outdata == 'prot':
                sample_seq.append(allelesets[0][col])
            elif flavor == 'codon' and args.outdata == 'dna':
                # Allele sets 1-3 hold the three codon positions.
                sample_seq.extend(
                    mask_unknown(allelesets[x][col]) for x in (1, 2, 3))
    if seqs:
        write_contig(current_contig, seqs)
    return ''
def main(arguments=None):
    """Main MVF Treemaker.

    Splits the entries of an MVF file into windows, builds a tree for each
    window through ``WindowData.maketree_raxml``, writes one row per window
    to ``--out`` and a ranked topology tally to ``<out>.counts``.

    Args:
        arguments: list of command-line tokens; ``sys.argv[1:]`` is read at
            call time when None (a ``sys.argv[1:]`` default would be frozen
            at import time).

    Returns:
        An empty string.

    Raises:
        SystemExit: after printing the version when ``--version`` is given,
            or on an argparse usage error.
    """
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(
        description=""" Process MVF into alignment""")
    parser.add_argument("--mvf", help="inputmvf")
    parser.add_argument("--out", help="tree list output file")
    parser.add_argument("--samples", nargs="*",
                        help="one or more taxon labels, default=all")
    parser.add_argument("--raxml_outgroups", nargs="*",
                        help="select outgroups to use in RAxML")
    parser.add_argument(
        "--rootwith", nargs="*",
        help="""root output trees with these taxa after RAxML""")
    parser.add_argument("--contigs", nargs="*",
                        help="choose one or more contigs, default=all")
    parser.add_argument("--outputcontiglabels", action="store_true",
                        help="output contig labels instead of ids")
    parser.add_argument("--outputempty", action="store_true",
                        help="output entries of windows with no data")
    parser.add_argument(
        "--hapmode", default="none",
        choices=["none", "randomone", "randomboth",
                 "major", "minor", "majorminor"],
        help=("haplotype splitting mode. \n"
              "'none' = no splitting; "
              "'randomone' = pick one allele randomly (recommended); "
              "'randomboth = pick alleles randomly, keep both; "
              "'major' = pick the more common allele; "
              "'minor' = pick the less common allele; "
              "'majorminor' = put the major in 'a' and minor in 'b' "))
    parser.add_argument(
        "--windowsize", type=int, default=10000,
        help="""specify genomic region size, or use -1 for whole contig""")
    parser.add_argument("--minsites", type=int, default=100,
                        help="""minimum number of sites [100]""")
    parser.add_argument(
        "--minsitedepth", type=int, default=1,
        help="""mininum depth of sites to use in alignment [1]""")
    parser.add_argument(
        "--minseqcoverage", type=float, default=0.1,
        help="""proportion of total alignment a sequence must cover to be retianed [0.1]""")
    parser.add_argument("--mindepth", type=int, default=4,
                        help="""minimum number of sequences [4]""")
    parser.add_argument(
        "--bootstrap", type=int,
        help="""turn on rapid bootstrapping for RAxML and perform specified number of replicates""")
    parser.add_argument("--raxml_model", default="GTRGAMMA",
                        help="""choose custom RAxML model [GTRGAMMA]""")
    parser.add_argument("--raxmlpath", help="manually specify RAxML path")
    parser.add_argument("--raxmlopts", default="",
                        help="specify additional RAxML arguments")
    parser.add_argument(
        "--duplicateseq", default="dontuse",
        choices=["dontuse", "keep", "remove"],
        help="""[dontuse] remove for tree making, replace as zero-branch-length sister taxa; keep=keep in for tree making, may cause errors for RAxML; remove=remove entirely from alignment""")
    parser.add_argument("--tempdir", default="raxmltemp",
                        help=("temporary dir. \n"
                              "location default=./tempdir"))
    parser.add_argument("--tempprefix", default="mvftree",
                        help="""temporary file prefix, default=mvftree""")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    ## ESTABLISH FILE OBJECTS
    args.contigs = args.contigs or []
    mvf = MultiVariantFile(args.mvf, "read")
    treefile = OutputFile(
        args.out,
        headers=[
            "contig", "windowstart", "windowsize", "tree",
            "topology", "topoid",
            # 'templabels', ### USED FOR DEBUGGING ###
            "alignlength", "aligndepth", "status",
        ],
    )
    topofile = OutputFile(args.out + ".counts",
                          headers=["rank", "topology", "count"])
    # An empty list stands for "all samples" in the calls below.
    sample_cols = (
        (mvf.get_sample_indices(args.samples) or []) if args.samples else [])
    tmpdir = os.path.abspath(args.tempdir or "./raxmltemp")
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    # NOTE: the working directory changes only after the input and output
    # file objects above have been created from the paths as given.
    os.chdir(tmpdir)
    ## SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_cols)
    if args.hapmode in ["randomboth", "majorminor"]:
        # Two haplotypes per sample: every "a" label first, then every "b".
        main_labels = [label + x for x in ["a", "b"] for label in main_labels]
    params = {
        "outgroups": args.raxml_outgroups or [],
        "rootwith": args.rootwith or [],
        "minsites": args.minsites,
        "minseqcoverage": args.minseqcoverage,
        "mindepth": args.mindepth,
        "raxmlpath": args.raxmlpath,
        "raxmlopts": args.raxmlopts,
        "duplicateseq": args.duplicateseq,
        "model": args.raxml_model,
        "bootstrap": args.bootstrap,
        "windowsize": args.windowsize,
        "hapmode": args.hapmode,
        "tempdir": tmpdir,
        "tempprefix": args.tempprefix,
    }
    topo_ids = {}
    topo_counts = {}

    def record_tree(entry):
        """Write one window result and update the topology tallies."""
        if entry["status"] != "ok":
            # Failed or empty windows are reported only on request and
            # never counted as a topology.
            if args.outputempty:
                treefile.write_entry(entry)
            return
        topo = entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            topo_ids[topo] = max(topo_ids.values()) + 1 if topo_ids else 0
        entry["topoid"] = topo_ids[topo]
        treefile.write_entry(entry)

    ## WINDOW START INTERATION
    current_contig = ""
    window_start = 0
    window = None
    for contig, pos, allelesets in mvf.iterentries(
            contigs=args.contigs, subset=sample_cols, quiet=args.quiet,
            no_invariant=False, no_ambig=False, no_gap=False, decode=True):
        if contig != current_contig or (
                args.windowsize != -1 and
                pos > window_start + args.windowsize):
            if window:
                record_tree(window.maketree_raxml(params))
            # Advance by one window on the same contig, restart at 0 on a
            # new contig (or always, in whole-contig mode).
            if contig == current_contig and args.windowsize != -1:
                window_start = window_start + args.windowsize
            else:
                window_start = 0
            current_contig = contig
            if args.outputcontiglabels:
                # Fall back to the id when the label is empty.
                contigname = (mvf.get_contig_label(current_contig) or
                              current_contig)
            else:
                contigname = current_contig
            window = WindowData(
                window_params={
                    "contigname": contigname,
                    "windowstart": (
                        "-1" if args.windowsize == -1 else window_start),
                    "windowsize": args.windowsize,
                    "labels": main_labels[:],
                }
            )
        ## ADD ALLELES
        if args.hapmode != "none":
            allelesets[0] = hapsplit(allelesets[0], args.hapmode)
        window.append_alleles(allelesets[0], minsitedepth=args.minsitedepth)
    ## LAST LOOP
    # `window` is still None when the MVF yielded no entries at all; the
    # unguarded call used to fail with AttributeError in that case.
    if window is not None:
        record_tree(window.maketree_raxml(params))
    window = None
    ## END WINDOW ITERATION
    # .items() works on both Python 2 and 3 (.iteritems() is 2-only).
    topo_list = sorted(
        [(count, topo) for topo, count in topo_counts.items()], reverse=True)
    for rank, (value, topo) in enumerate(topo_list):
        topofile.write_entry({"rank": rank, "count": value, "topology": topo})
    return ""
def main(arguments=None):
    """Main method for mvf_join.

    Concatenates several MVF files into one. Sample labels and contig
    labels of every input are matched against the growing output header;
    a per-file ``MvfTransformer`` records the column and contig-id
    remappings, which are then applied while the entries are copied to the
    output in blocks of ``--linebuffer`` entries.

    Args:
        arguments: list of command-line tokens; ``sys.argv[1:]`` is read at
            call time when None.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--main_header_file`` is not one of the input
            files.
        SystemExit: after printing the version when ``--version`` is given,
            or on an argparse usage error.
    """
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description=""" MVF joining both veritically (separate contigs) and and horizontally (different samples)""")
    parser.add_argument("mvf", nargs="*", help="one or more mvf files")
    parser.add_argument("--out", help="output mvf file")
    parser.add_argument("--newcontigs", action="store_true",
                        help="Don't match contigs using labels (not IDs)")
    parser.add_argument("--newsamples", action="store_true",
                        help="Don't match samples using labels")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of entries to write in a block")
    parser.add_argument("--main_header_file",
                        help="""name of MVF file to use the headers from (default=first in list)""")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    ## Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        # From here on the attribute holds the index of that file.
        args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    ## Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        ## This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            newindex = concatmvf.metadata['labels'].index(label)
            if newindex != i:
                transformer.set_label(i, newindex)
        # .items() works on both Python 2 and 3 (.iteritems() is 2-only).
        for contigid, contigdata in mvf.metadata['contigs'].items():
            known_labels = [
                concatmvf.metadata['contigs'][x]['label']
                for x in concatmvf.metadata['contigs']]
            if contigdata['label'] not in known_labels:
                # New contig: keep its id unless that id is already taken.
                # A real conditional, so a falsy id is not discarded as it
                # was by the `and ... or` form.
                if contigid not in concatmvf.metadata['contigs']:
                    newid = contigid
                else:
                    newid = concatmvf.get_next_contig_id()
                concatmvf.metadata['contigs'][newid] = contigdata
            else:
                # Known label: reuse the id it already has in the output.
                for concatid, concatdata in (
                        concatmvf.metadata['contigs'].items()):
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    ## Write output header
    concatmvf.write_data(concatmvf.get_header())
    ## Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                # Columns move, so decode, reorder and re-encode.
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        (alleles[transformer.labels[x]]
                         if x in transformer.labels else alleles[x])
                        for x in range(len(alleles))]))
            if transformer.contigs:
                # The mapping is the `contigs` attribute, as tested on the
                # line above; subscripting the transformer object itself
                # (transformer['contigs']) was inconsistent with that.
                if contigid in transformer.contigs:
                    contigid = transformer.contigs[contigid]
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.linebuffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
        if entries:
            concatmvf.write_entries(entries)
            entries = []
            nentries = 0
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
def calc_pairwise_dnds(args):
    """Calculate pairwise dN/dS with PAML for the codon entries of an MVF.

    Reads ``args.mvf`` (and optionally ``args.gff`` for annotations),
    walks the entries contig by contig / window by window, accumulates a
    codon alignment per window and passes it to ``paml_pwcalc_dnds``; the
    resulting values are appended to the file named by
    ``self.params['branchlrt']``.

    NOTE(review): the body reads ``self.params`` / ``self.data`` and calls
    ``self.write()`` although this function has no ``self`` parameter;
    unless a module-level ``self`` exists this raises NameError at the
    first such line. It looks adapted from a method -- confirm.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed source; confirm block boundaries against the
    upstream file before relying on them.

    Returns:
        An empty string.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    annotations = {}
    coordinates = {}
    if args.gff:
        annotations, coordinates = (parse_gff_annotate(args.gff))
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    # Per-window tallies (counts) and whole-run tallies (totals).
    # NOTE(review): `.add(...)` is called on these, which
    # collections.Counter does not provide -- presumably a project class.
    counts = Counter()
    totals = Counter()
    if self.params['output_align']:
        outputalign = []
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b',
        'fgdnds0', 'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree',
        'errorstate'
    ]
    # Create/truncate the output table and write its header row.
    with open(self.params['branchlrt'], 'w') as branchlrt:
        genealign = []
        branchlrt.write("\t".join(
            ['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
            ["null.{}".format(x) for x in fieldtags] +
            ["test.{}".format(x) for x in fieldtags] +
            ['tree']) + "\n")
    groups = self.params['allele_groups'].values()
    speciesgroups = self.params['speciesgroups'].values()
    # Sorted union of every member of every allele group.
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    # Reverse lookup: group member -> species name (not read again below).
    speciesrev = {}
    for species in self.params['speciesgroups']:
        speciesrev.update([(x, species) for x in
                           self.params['speciesgroups'][species]])
    if self.params['mincoverage']:
        if self.params['mincoverage'] < len(groups) * 2:
            raise RuntimeError(""" Error: GroupUniqueAlleleWindow: --mincoverage cannot be lower than the twice the number of specified groups in --allele-groups """)
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        # Window boundary: new contig, or position beyond the current
        # window (unless windowsize is -1, i.e. one window per contig).
        if contig != current_contig or (
                self.params['windowsize'] != -1 and
                pos > current_position + self.params['windowsize']):
            xkey = (
                current_contig,
                current_position,
            )
            self.data[xkey] = counts.copy()
            # The two change-count keys written here are spelled
            # 'nonsynyonymous_changes' / 'synyonymous_changes'; the ratio
            # below reads 'nonsynonymous_changes' / 'synonymous_changes',
            # which can only come from the copied counts.
            self.data[xkey].update([
                ('contig', (self.params['uselabels'] and
                            mvf.get_contig_label(current_contig))),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes',
                 counts.get('synonymous_changes', 0) or 0)
            ])
            self.data[xkey].update([
                ('ns_ratio', (float(self.data[xkey].get(
                    'nonsynonymous_changes', 0)) / (
                        self.data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation',
                 annotations.get(self.data[xkey]['contig'], '.')),
                ('coordinates',
                 coordinates.get(self.data[xkey]['contig'], '.'))
            ])
            if genealign:
                # Only contigs whose integer id lies within the optional
                # [startcontig, endcontig] range are sent to PAML.
                if (self.params.get('endcontig', 1000000) >=
                        int(current_contig)) and (self.params.get(
                            'startcontig', 0) <= int(current_contig)):
                    # print(current_contig)
                    (dnval, dsval) = paml_pwcalc_dnds(genealign)
                    with open(self.params['branchlrt'], 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                self.data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, dnval, dsval
                            ]
                        ]) + "\n")
            # NOTE(review): assumed to reset once per window, outside the
            # `if genealign:` test -- confirm.
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif self.params['windowsize'] != -1:
                current_position += self.params['windowsize']
            counts = Counter()
        # Allele set 0 is used as the protein column, sets 1-3 as the
        # three codon positions.
        proteins = allelesets[0]
        codons = allelesets[1:4]
        # Every set is a single character: the same codon is appended for
        # every row of the alignments.
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if self.params['output_align']:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign in range(len(outputalign)):
                        outputalign[ialign].append(''.join(codons))
            if self.params['branchlrt']:
                if not genealign:
                    genealign = [[''.join(codons)]
                                 for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        if len(proteins) > 1:
            # Skip entries whose protein string has '+' as its second
            # character.
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if self.params['mincoverage']:
            # Require enough samples that are neither 'X' nor '-'.
            if sum([int(x not in 'X-') for x in proteins]) < (
                    self.params['mincoverage']):
                continue
        species_groups = [[proteins[i] for i in x
                           if proteins[i] not in '-X']
                          for x in speciesgroups]
        # Every species group needs at least one usable amino acid.
        if any(len(x) == 0 for x in species_groups):
            continue
        xcodons = [mvf.decode(x) for x in codons]
        # Transpose three per-position strings into one codon per sample.
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        # Any amino acid differing from the first group's first one.
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        totals.add('variable_codons', val=int(
            sum([int(len(set(x) - set('X-')) > 1)
                 for x in xcodons]) > 0))
        if self.params['output_align']:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if self.params['branchlrt']:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        # Distinct codons per allele group, dropping any with '-' or 'X'.
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        for i in range(len(codon_groups)):
            # Groups containing any of the codes R/Y/W/K/M/S are passed
            # through hapgroup().
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        # Classify only when no two groups share a codon.
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_table['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            print('NON', contig, pos, allelesets, codon_groups,
                  protein_groups, groups, mvf.get_contig_label(contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            print('SYN', contig, pos, allelesets, codon_groups,
                  protein_groups, groups, mvf.get_contig_label(contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    self.params['totals'] = totals
    self.write()
    if self.params['output_align']:
        # FASTA-style dump: one record per accumulated alignment row.
        with open(self.params['output_align'], 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''
def main(arguments=None):
    """Main method for mvf2fasta.

    Streams the entries of an MVF file into one temporary file per sample
    (a FASTA header is started whenever the contig id changes), then
    concatenates the temporary files into ``--out`` in sample order and
    removes them.

    Args:
        arguments: list of command-line tokens; ``sys.argv[1:]`` is read at
            call time when None.

    Returns:
        An empty string.

    Raises:
        SystemExit: after printing the version when ``--version`` is given,
            or on an argparse usage error.
    """
    import shutil

    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description=""" Process MVF into FASTA alignment""")
    parser.add_argument("--mvf", help="input MVF file", required=True)
    parser.add_argument("--out", help="target FASTA file", required=True)
    parser.add_argument("--labeltype", choices=['long', 'short'],
                        default='long',
                        help="long labels with all metadata or short ids")
    parser.add_argument("--regions", nargs='*',
                        help="one or more regions id,start,stop (inclusive)")
    parser.add_argument("--samples", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--outgroups", nargs="*")
    parser.add_argument("--contigs", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--buffer", type=int, default=10,
                        help="size (Mbp) of write buffer for each sample")
    parser.add_argument("--tmpdir", default=".",
                        help="directory to write temporary fasta files")
    # NOTE(review): store_true combined with default=True means quiet is
    # always on; kept as-is so the default output does not change.
    parser.add_argument("--quiet", action="store_true", default=True,
                        help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.contigs:
        # Map contig id -> metadata; dict() over the bare metadata dicts
        # does not produce that mapping.
        contigs = dict(
            (c, mvf.metadata['contigs'][c]) for c in args.contigs)
    else:
        contigs = dict(mvf.metadata['contigs'])
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    # Temporary files live in --tmpdir, the same place they are removed
    # from (they used to be opened in the current directory).
    tmp_paths = dict(
        (label, os.path.join(args.tmpdir, label + '.tmp'))
        for label in labels)
    tmp_files = {}
    try:
        for label in labels:
            # NOTE(review): --buffer is passed to open() as a raw buffer
            # size although the help text says Mbp -- confirm the unit.
            tmp_files[label] = open(tmp_paths[label], 'w+', args.buffer)
        current_contig = None
        # No `subset=` here: the columns are picked with the absolute
        # indices in sample_cols, so each decoded entry must keep every
        # sample column (the other exporter in this file does the same).
        for contig, _, allelesets in mvf.iterentries(
                contigs=args.contigs, quiet=args.quiet, decode=True):
            # Entries arrive already decoded; the first allele set holds
            # one character per sample.
            alleles = allelesets[0]
            if current_contig != contig:
                current_contig = contig
                for col, label in zip(sample_cols, labels):
                    if args.labeltype == 'long':
                        tmp_files[label].write(
                            '\n>{} contig={} length={}\n{}'.format(
                                label,
                                contigs[current_contig]['label'],
                                contigs[current_contig]['length'],
                                alleles[col]))
                    elif args.labeltype == 'short':
                        tmp_files[label].write(
                            '\n>{}_{}\n{}'.format(
                                label,
                                contigs[current_contig]['label'],
                                alleles[col]))
            else:
                for col, label in zip(sample_cols, labels):
                    tmp_files[label].write(alleles[col])
        with open(args.out, 'w') as outfile:
            # Follow `labels` so the sample order is deterministic.
            for label in labels:
                handle = tmp_files[label]
                handle.seek(0, 0)
                shutil.copyfileobj(handle, outfile)
    finally:
        # Close and delete the temporary files even when an error occurs.
        for label, handle in tmp_files.items():
            handle.close()
            if os.path.exists(tmp_paths[label]):
                os.remove(tmp_paths[label])
    return ''