def setupdir(strains, genomedb):
    """Prepare the output folder and write strain-tagged peptide FASTA files.

    Creates the module-level ``outdir`` (an already existing folder is
    tolerated and reported), calls ``pp.createdirs`` for the working
    subfolders, verifies that ``genomedb`` and its ``pep`` subfolder exist,
    and then rewrites ``<genomedb>/pep/<strain>.pep.fa`` into
    ``<outdir>/faa/<strain>.faa`` with every record id changed to
    ``<strain>|<original id>``.

    Reads the module-level globals ``outdir`` and ``verbose``.

    Args:
        strains: list of strain names; each needs a ``.pep.fa`` file in the
            ``pep`` folder of ``genomedb``.
        genomedb: path to the genome database folder.

    Raises:
        SystemExit: if ``genomedb``, its ``pep`` folder, or a strain's
            peptide file is missing (a message is printed first).
        OSError: if ``outdir`` cannot be created for any reason other than
            already existing.
        IOError: if a peptide file exists but cannot be opened.
    """
    # Single-argument print(...) produces the same text under the Python 2
    # print statement this file relies on.
    try:
        os.makedirs(outdir)
    except OSError as exc:
        # Only "already exists" is expected; anything else (permissions,
        # bad path, ...) must not be silently ignored.
        if exc.errno != errno.EEXIST:
            raise
        print("Database folder exists: %s" % outdir)
    pp.createdirs(outdir, ["faa", "m8", "out", "paranoid_output", "dmnd_tmp"])

    if not os.path.isdir(genomedb):
        print("GenomeDB folder %s doesn't exist...exiting." % genomedb)
        sys.exit()
    if not os.path.isdir(os.path.join(genomedb, "pep")):
        print("GenomeDB folder is missing a 'pep' folder...exiting.")
        sys.exit()

    if verbose:
        print("Formatting %d fasta files..." % len(strains))

    for s in strains:
        pep_path = os.path.join(genomedb, "pep", s + ".pep.fa")
        faa_path = os.path.join(outdir, "faa", s + ".faa")
        try:
            pep_handle = open(pep_path, "r")
        except IOError as exc:
            if exc.errno == errno.ENOENT:
                print("%s not found in database...check your strainlist." % s)
                sys.exit()
            # Any other open failure used to fall through and reuse a stale
            # (or unbound) handle; surface it instead.
            raise
        # Nested ``with`` blocks guarantee both files are closed even if
        # parsing or writing fails part-way through.
        with pep_handle:
            with open(faa_path, "w") as faa_handle:
                for seq in SeqIO.parse(pep_handle, 'fasta'):
                    seq.id = s + "|" + str(seq.id)
                    SeqIO.write(seq, faa_handle, 'fasta')
def main():
    """Command-line entry point for the ortholog extraction/alignment run.

    Parses the arguments, sets the module-level globals ``prefix``,
    ``outdir`` and ``use_MP``, and calls ``pp.createdirs`` for the
    ``orthos``, ``ortho_align`` and ``hmms`` folders under ``prefix``.

    The strain list comes from the file ``args.strains`` (one entry per
    line, trailing whitespace stripped) or, when that is not given, from
    ``get_strains()``.  The ortholog list comes from the file
    ``args.orthos``; otherwise from ``parse_threshold_matrix`` when
    ``args.threshold`` is set, else from ``parse_matrix``.

    It then runs, in order, ``index_hmms``, ``extract_hmms``,
    ``get_orthos``, ``align_orthos`` and ``create_master_alignment``, and
    finally calls ``pp.cleanup`` on the three working folders when
    ``args.clean`` is set.
    """
    global prefix, outdir, use_MP

    args = parse_args()
    prefix = os.path.abspath(args.prefix)
    outdir = os.path.abspath(args.outdir)
    pp.createdirs(prefix, ["orthos", "ortho_align", "hmms"])

    # Read list files inside ``with`` so the handles are closed promptly
    # instead of being left to the garbage collector.
    if args.strains:
        with open(os.path.abspath(args.strains), 'r') as handle:
            strains = [line.rstrip() for line in handle]
    else:
        strains = get_strains()

    if args.orthos:
        with open(os.path.abspath(args.orthos), 'r') as handle:
            orthos = [line.rstrip() for line in handle]
    elif args.threshold:
        orthos = parse_threshold_matrix(args.threshold, strains)
    else:
        orthos = parse_matrix(strains)

    # Fall back to every available core when no CPU count was requested.
    cpus = args.cpus if args.cpus else mp.cpu_count()
    use_MP = bool(args.use_MP)

    index_hmms()
    extract_hmms(orthos)
    get_orthos(orthos, strains)
    align_orthos(orthos, cpus)
    create_master_alignment(orthos, strains)

    if args.clean:
        for folder in ("ortho_align", "orthos", "hmms"):
            pp.cleanup(os.path.join(prefix, folder))
def main():
    """Command-line entry point for adding new strains to an existing run.

    Parses the arguments and sets the module-level globals ``pypath`` (the
    folder containing the invoked script), ``outdir``, ``cpus`` and
    ``use_MP``.  New strain names are read from the file
    ``args.new_strainlist`` (one per line, trailing whitespace stripped).

    After calling ``pp.createdirs`` for the ``prop_*`` working folders it
    runs, in order: ``check_strains``, ``make_diamond_databases``,
    ``run_diamond``, ``get_genes``, ``parse_diamond``, ``run_inparanoid``,
    ``parse_inparanoid``, ``extract_fastas`` and ``pp.dump_matrices``, then
    calls ``pp.cleanup`` on every ``prop_*`` folder.
    """
    global pypath, outdir, cpus, use_MP

    args = parse_args()
    pypath = os.path.abspath(os.path.dirname(sys.argv[0]))
    outdir = os.path.abspath(args.outdir)
    genomedb = os.path.abspath(args.genomedb)

    # ``with`` closes the strain list file as soon as it has been read.
    with open(os.path.abspath(args.new_strainlist), 'r') as handle:
        new_strains = [line.rstrip() for line in handle]

    # Fall back to every available core when no CPU count was requested.
    cpus = args.cpus if args.cpus else mp.cpu_count()
    use_MP = bool(args.use_MP)

    pp.createdirs(outdir, [
        "prop_faa", "prop_dmnd", "prop_m8", "prop_out",
        "prop_paranoid_output", "prop_homolog_faa"
    ])

    check_strains(new_strains, genomedb)
    make_diamond_databases(new_strains)
    run_diamond(new_strains)
    genes = get_genes(new_strains)
    parse_diamond(genes)
    run_inparanoid(new_strains, pypath)
    group_members = parse_inparanoid(new_strains)
    extract_fastas(genes, group_members)
    pp.dump_matrices(outdir)

    # The working folders are always removed at the end of this script.
    for f in [
            "prop_m8", "prop_out", "prop_dmnd", "prop_paranoid_output",
            "prop_faa", "prop_homolog_faa"
    ]:
        pp.cleanup(os.path.join(outdir, f))
def main():
    """Command-line entry point for building the homolog group database.

    Parses the arguments, reads the strain names from ``args.strainlist``
    (one per line, trailing whitespace stripped) and exits if the list
    contains a duplicate or if ``args.mode`` is set to an unknown value.
    It then sets the module-level globals ``outdir``, ``pypath``, ``cpus``,
    ``clean``, ``verbose``, ``inflate`` (default 2.0), ``threshold``
    (default 0), ``multi`` and ``use_MP`` from the arguments.

    When ``args.mode`` is empty every step runs in order; otherwise only
    the step named by ``args.mode`` runs:

    * ``multi_setup`` -- ``setupdir``, copy of the strain list into
      ``outdir``, ``make_diamond_databases``, ``run_diamond``
    * ``parse`` -- ``get_genes``, ``parse_diamond``, ``run_inparanoid``
    * ``cluster`` -- ``create_abc_file``, ``mcxload``, ``mcl_cluster``,
      ``mcxdump``
    * ``extract`` -- ``hash_fastas``, ``parse_clusters``, ``parse_groups``
    * ``build`` -- ``cdhit_seqs``, ``align_groups``, ``build_hmms``,
      ``emit_consensus_seqs``, ``combine_seqs``, ``combine_homologs``,
      ``pp.dump_matrices``

    With ``clean`` set, intermediate folders are removed via ``pp.cleanup``
    as the cluster and build steps progress.

    Raises:
        SystemExit: on a duplicated strain name or an unknown mode.
    """
    global outdir, pypath, cpus, clean, verbose, inflate, threshold, multi
    global use_MP

    args = parse_args()
    genomedb = os.path.abspath(args.genomedb)
    strainlist_path = os.path.abspath(args.strainlist)
    # ``with`` closes the strain list file as soon as it has been read.
    with open(strainlist_path, 'r') as handle:
        strains = [x.rstrip() for x in handle]
    if len(set(strains)) != len(strains):
        print("Duplicate entry in strainlist! Exiting...")
        # The message announces an exit, so actually stop here instead of
        # carrying on with a duplicated strain.
        sys.exit()

    outdir = os.path.abspath(args.outdir)
    pypath = os.path.abspath(os.path.dirname(sys.argv[0]))

    known_modes = ["multi_setup", "parse", "extract", "cluster", "build"]
    if args.mode and args.mode not in known_modes:
        print("Unknown mode!!! Exiting...")
        sys.exit()

    # Fall back to every available core when no CPU count was requested.
    cpus = args.cpus if args.cpus else mp.cpu_count()
    clean = bool(args.clean)
    verbose = bool(args.verbose)
    inflate = args.inflate if args.inflate else 2.0
    threshold = args.threshold if args.threshold else 0
    multi = bool(args.multi)
    use_MP = bool(args.use_MP)

    # No mode given means "run the whole pipeline".
    run_all = not args.mode

    if run_all or args.mode == "multi_setup":
        setupdir(strains, genomedb)
        shutil.copy(strainlist_path, os.path.join(outdir, "strainlist.txt"))
        make_diamond_databases(strains)
        run_diamond(strains)

    if run_all or args.mode == "parse":
        genes = get_genes(strains)
        parse_diamond(genes, strains)
        run_inparanoid(strains)

    if run_all or args.mode == "cluster":
        if clean:
            pp.cleanup(os.path.join(outdir, "out"))
        pp.createdirs(outdir, ["mcl"])
        create_abc_file()
        mcxload()
        mcl_cluster()
        mcxdump()

    if run_all or args.mode == "extract":
        seqdata, desc, seq_number = hash_fastas()
        pp.createdirs(
            outdir,
            ["homolog_faa", "clustered", "aligned", "hmms", "consensus_seqs"])
        parse_clusters(strains, seq_number)
        parse_groups(seqdata, desc)

    if run_all or args.mode == "build":
        cdhit_seqs()
        align_groups()
        if clean:
            pp.cleanup(os.path.join(outdir, "clustered"))
        build_hmms()
        if clean:
            pp.cleanup(os.path.join(outdir, "aligned"))
        emit_consensus_seqs()
        combine_seqs()
        combine_homologs()
        # NOTE(review): nesting of this final section was reconstructed from
        # whitespace-collapsed source -- cleanup (including the .dmnd file)
        # is assumed to be conditional on ``clean`` and the matrix dump
        # unconditional within the build step; confirm against the original.
        if clean:
            for folder in ("hmms", "consensus_seqs", "m8", "paranoid_output",
                           "dmnd_tmp", "faa", "homolog_faa", "mcl"):
                pp.cleanup(os.path.join(outdir, folder))
            os.remove(os.path.join(outdir, "all_strains.dmnd"))
        pp.dump_matrices(outdir)