def run_fanngo(config):
    """Generate the FANN-GO MATLAB run script from the template and execute it.

    Placeholders in the FANN-GO template ($PATH, $INPUT_FASTA, $OUTPUT_SCORE)
    are replaced with run-specific paths, the resulting ``.m`` script is
    written to the FANN-GO output dir, and MATLAB is invoked unless the score
    file already exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fanngo_sw_conf = config["data"]["mixed-method"]["fanngo"]
    fanngo_conf = config["software"]["fanngo"]
    fanngo_template = fanngo_conf["path"] + "/" + fanngo_conf["template"]
    run_file_path = (workdir + fanngo_sw_conf["out_dir"] + "/" +
                     config["input"]["basename"] + ".fanngo.m")
    out_score = (workdir + fanngo_sw_conf["out_dir"] + "/" +
                 config["input"]["basename"] + ".score.txt")
    input_fasta = workdir + "input/" + config["input"]["fasta"]
    cwd = os.getcwd()
    code_path = cwd + "/" + fanngo_conf["path"] + "/code"
    # Placeholder -> concrete path substitutions applied to the template.
    substitutions = {
        "$PATH": code_path,
        "$INPUT_FASTA": input_fasta,
        "$OUTPUT_SCORE": out_score,
    }
    # Use context managers so the handles are closed even on error
    # (the original leaked both the template and the run-file handles).
    with open(fanngo_template, "r") as template_f:
        conf_lines = template_f.readlines()
    with open(run_file_path, "w") as run_file:
        for line in conf_lines:
            outline = line.strip()
            for placeholder, value in substitutions.items():
                if placeholder in outline:
                    outline = outline.replace(placeholder, value)
            # write() replaces the Py2-only `print >> run_file` syntax.
            run_file.write(outline + "\n")
    cmd = ["/matlab/bin/matlab", "-nojvm", "-nodisplay", "-nosplash"]
    print(" ".join(cmd))
    check_output_and_run(out_score, cmd, run_file_path)
def run_fanngo_split(config, split_fa):
    """Generate and run a FANN-GO Octave script for one split FASTA file.

    The run script is produced by ``generate_fanngo_file`` from the template
    and executed with Octave; the run is skipped when the score file already
    exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    split_fa : str
        Path to the split FASTA file to annotate.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fanngo_sw_conf = config["data"]["mixed-method"]["fanngo"]
    fanngo_conf = config["software"]["fanngo"]
    fanngo_template = fanngo_conf["template"]
    out_base = os.path.basename(split_fa.replace(".fa", ""))
    run_file_path = (workdir + fanngo_sw_conf["out_dir"] + "/split/" +
                     out_base + ".fanngo.m")
    out_score = (workdir + fanngo_sw_conf["out_dir"] + "/split/" +
                 out_base + ".score.txt")
    cwd = os.getcwd()
    print(split_fa)
    # Read the template via a context manager (the original leaked the handle
    # and also opened run_file_path twice, leaking the first handle).
    with open(fanngo_template, "r") as template_f:
        conf_lines = template_f.readlines()
    with open(run_file_path, "w") as run_file:
        generate_fanngo_file(conf_lines, cwd, fanngo_conf, split_fa,
                             out_score, run_file)
    cmd = [
        "octave", "--norc", "--no-window-system", "--quiet", "--no-history",
        "--traditional", "--verbose"
    ]
    # FANN-GO reads the worker count from the environment.
    os.environ["NPROC"] = str(config["input"]["cpus"])
    print(run_file_path)
    check_output_and_run(out_score, cmd, run_file_path)
def get_rbh_annotations(config):
    """Run the reciprocal-best-hit annotation R script.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    uniprot_tmpdir = config["data"]["seq-sim"]["uniprot"]["tmpdir"]
    out_file = config["input"]["gomap_dir"] + "/" + uniprot_tmpdir + "/test"
    rbh_cmd = ["Rscript", "code/pipeline/run_rbh.r",
               config["input"]["config_file"]]
    check_output_and_run(out_file, rbh_cmd)
def run_uniprot_blast(config):
    """Run reciprocal blastp searches between the input proteome and UniProt.

    Two searches are performed: input-vs-UniProt and UniProt-vs-input, each
    skipped when its output file already exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    pipeline_loc = config["pipeline"]["pipeline_loc"] + "/"
    uniprot_config = config["data"]["seq-sim"]["uniprot"]
    uniprot_fa = (pipeline_loc + uniprot_config["basedir"] + "/" +
                  uniprot_config["basename"] + ".fa")
    blast_config = config["software"]["blast"]
    blast_bin = pipeline_loc + blast_config["bin"] + "/blastp"
    workdir = config["input"]["gomap_dir"] + "/"
    input_fa = workdir + "/input/" + config["input"]["fasta"]
    tmp_base_dir = workdir + uniprot_config["tmpdir"]
    cpus = str(config["input"]["cpus"])

    def _blast(db, query, out_path):
        # One blastp invocation; `outcols` is presumably a module-level
        # output-format string defined elsewhere in this file — verify.
        cmd = [blast_bin, "-outfmt", outcols, "-db", db, "-query", query,
               "-out", out_path, "-num_threads", cpus]
        check_output_and_run(out_path, cmd)

    # main vs uniprot
    main2other_file = (tmp_base_dir + "/" + config["input"]["basename"] +
                       "-vs-" + uniprot_config["basename"] + ".bl.out")
    _blast(uniprot_fa, input_fa, main2other_file)

    # uniprot vs main
    other2maize_file = (tmp_base_dir + "/" + uniprot_config["basename"] +
                        "-vs-" + config["input"]["basename"] + ".bl.out")
    _blast(input_fa, uniprot_fa, other2maize_file)
def run_pannzer(config):
    """Configure and run PANNZER on each pre-computed BLAST XML file.

    For every ``*.xml`` BLAST result a PANNZER config file is generated from
    the template, written to the conf dir, and ``run.py`` is executed unless
    the expected ``*_results.GO`` output already exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    pannzer_data = config["data"]["mixed-method"]["pannzer"]
    pannzer_conf = config["software"]["pannzer"]
    blast_dir = workdir + pannzer_data["preprocess"]["blast"]
    blast_files = glob(blast_dir + "/*.xml")
    cwd = os.getcwd()
    # PANNZER's run.py must be executed from its own directory.
    os.chdir(cwd + "/" + pannzer_conf["path"])
    try:
        for blast_file in blast_files:
            run_conf = ConfigParser.ConfigParser()
            run_conf.read(pannzer_conf["conf_template"])
            run_conf.set("GENERAL_SETTINGS", "INPUT_FOLDER",
                         cwd + "/" + pannzer_data["preprocess"]["blast"])
            run_conf.set("GENERAL_SETTINGS", "INPUT_FILE", blast_file)
            run_conf.set("GENERAL_SETTINGS", "RESULT_FOLDER",
                         workdir + pannzer_data["result_dir"])
            run_conf.set("GENERAL_SETTINGS", "QUERY_TAXON",
                         config["input"]["taxon"])
            out_base = os.path.basename(blast_file).replace(".xml", "")
            out_conf = workdir + pannzer_data["conf_dir"] + "/" + out_base + ".conf"
            run_conf.set("GENERAL_SETTINGS", "RESULT_BASE_NAME", out_base)
            db_conf = pannzer_conf["database"]
            for key in ("SQL_DB_HOST", "SQL_DB_PORT", "SQL_DB_USER",
                        "SQL_DB_SOCKET"):
                run_conf.set("MYSQL", key, db_conf[key])
            run_conf.set("MYSQL", "SQL_DB", db_conf["SQL_DB"])
            # Write via a context manager instead of leaking an anonymous
            # open() handle as the original did.
            with open(out_conf, "w") as conf_out:
                run_conf.write(conf_out)
            pannzer_out = (run_conf.get("GENERAL_SETTINGS", "RESULT_FOLDER") +
                           "/" + out_base + "_results.GO")
            check_output_and_run(pannzer_out, ["python", "run.py", out_conf])
    finally:
        # Always restore the original working directory, even on error.
        os.chdir(cwd)
def iprs2gaf(config):
    """Filter InterProScan TSV output to GO-bearing rows and convert to GAF.

    Lines containing "GO:" are copied to ``<basename>.go.tsv``, then an R
    script performs the GAF conversion; the run is skipped when the output
    GAF already exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    dom_config = config["data"]["domain"]
    workdir = config["input"]["gomap_dir"] + "/"
    tsv_base = (workdir + dom_config["tmpdir"] + "/" +
                config["input"]["basename"])
    infile = tsv_base + ".tsv"
    tmpfile = tsv_base + ".go.tsv"
    gaf_dir = workdir + config["data"]["gaf"]["raw_dir"] + "/"
    # Keep only rows that carry a GO annotation. Context managers close both
    # handles (the original leaked tmp_iprs on error), and plain "r" replaces
    # the needless read/write "r+" mode — the input is never written to.
    with open(infile, "r") as raw_iprs, open(tmpfile, "w") as tmp_iprs:
        for line in raw_iprs:
            if "GO:" in line:
                tmp_iprs.write(line)
    out_gaf = gaf_dir + os.path.basename(infile)
    tool_ext = "." + dom_config["tool"]["name"] + ".gaf"
    # Anchor and escape the extension: the original ".tsv" pattern could
    # match an arbitrary character + "tsv" anywhere in the name.
    out_gaf = re.sub(r"\.tsv$", tool_ext, out_gaf)
    cmd = [
        "Rscript", "code/pipeline/iprs2gaf.r", config["input"]["config_file"]
    ]
    check_output_and_run(out_gaf, cmd)
def setup(config):
    """Download and extract the GOMAP data bundle from CyVerse.

    The steps run by this function:

    1. ``irsync`` the remote CyVerse path into ``data/`` (irsync checksums
       both ends, so only missing/changed files are transferred).
    2. Decompress every file listed in ``data/compress_files.txt``.
    3. Extract every archive listed in ``data/tar_files.txt``.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    # NOTE: the original placed this docstring after setlogging(), making it
    # a no-op string; it is now a real docstring.
    setlogging(config, "setup")
    outdir = "data/"
    # NOTE(review): `cyverse_path` is a module-level constant defined
    # elsewhere in this file.
    cmd = ["irsync", "-rv", cyverse_path, outdir]
    logging.info("Downloading file from Cyverse using irsync")
    # The checksum comparison itself may take time even when nothing needs
    # downloading.
    print(os.getcwd())
    print(" ".join(cmd))
    check_output_and_run("outfile", cmd)

    # Step 2: gunzip the plain-compressed files.
    with open("data/compress_files.txt", "r") as comp_files:
        for infile in comp_files:
            outfile = outdir + infile.strip()
            gzfile = outfile + ".gz"
            if not os.path.exists(gzfile):
                print(gzfile + " doesn't exist")
                continue
            if os.path.exists(outfile):
                print(gzfile + " already extracted")
                continue
            print("Extracting " + gzfile)
            with gzip.open(gzfile, "rb") as in_f:
                with open(outfile, "wb") as out_f:
                    shutil.copyfileobj(in_f, out_f)
            os.remove(gzfile)

    # Step 3: unpack the tar archives.
    with open("data/tar_files.txt", "r") as tar_list:
        for infile in tar_list:
            infile = infile.strip()
            outfile = outdir + infile
            tar_f = outfile + ".tar.gz"
            if not os.path.exists(tar_f):
                print(tar_f + " doesn't exist")
                continue
            if os.path.exists(outfile):
                print(tar_f + " already extracted")
                continue
            print("Extracting " + tar_f)
            with tarfile.open(tar_f) as tar:
                tar.extractall("data/")
            os.remove(tar_f)
def make_blastdb(in_fasta, config):
    """Build a protein BLAST database in place for ``in_fasta``.

    Skipped (via check_output_and_run) when the ``.phr`` index file for the
    database already exists.

    Parameters
    ----------
    in_fasta : str
        Path to the protein FASTA file; also used as db name and title.
    config : dict
        The config dict generated in the gomap.py script.
    """
    blast_bin_dir = (config["pipeline"]["pipeline_loc"] + "/" +
                     config["software"]["blast"]["bin"])
    makedb_command = [
        blast_bin_dir + "/makeblastdb",
        "-in", in_fasta,
        "-dbtype", "prot",
        "-out", in_fasta,
        "-title", in_fasta,
        "-hash_index",
    ]
    check_output_and_run(in_fasta + ".phr", makedb_command)
def run_hmmer(config):
    """Regroup split FASTA files into larger chunks and run hmmscan on each.

    Phase 1 concatenates the per-split FASTA files into chunks of roughly
    ``num_seqs`` sequences in the argot2 hmmer directory. Phase 2 runs
    ``hmmscan`` on every chunk and zips the tabular output; a chunk is
    skipped when its ``.hmm.out.zip`` already exists.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    workdir = config["input"]["gomap_dir"] + "/"
    fa_dir = workdir + config["input"]["split_path"]
    fa_files = natsorted(glob(fa_dir + "/*fa"))
    hmmer_bin = config["software"]["hmmer"]["path"] + "/hmmscan"
    hmmerdb = config["data"]["mixed-method"]["preprocess"]["hmmerdb"]
    cpu = str(config["input"]["cpus"])
    tmp_file = workdir + "hmmscan.tmp"
    num_seqs = int(config["input"]["num_seqs"])
    hmmer_dir = (workdir +
                 config["data"]["mixed-method"]["argot2"]["preprocess"]["hmmer"])

    # Phase 1: merge the split files into chunks of num_seqs sequences.
    chunk_seqs = 0
    chunk_count = 0
    all_seqs = []
    for fa_file in fa_files:
        seqs = list(SeqIO.parse(fa_file, "fasta"))
        chunk_seqs += len(seqs)
        all_seqs += seqs
        # NOTE(review): this flushes only when the running count is an exact
        # multiple of num_seqs (or at the last file); kept as-is to preserve
        # the original chunking behavior.
        if chunk_seqs % num_seqs == 0 or fa_file == fa_files[-1]:
            chunk_count += 1
            out_fa = (hmmer_dir + "/" + config["input"]["basename"] + "." +
                      str(chunk_count) + ".fa")
            print(out_fa)
            SeqIO.write(all_seqs, out_fa, "fasta")
            all_seqs = []
            chunk_seqs = 0

    # Phase 2: hmmscan each chunk and zip the tabular output.
    for infile in glob(hmmer_dir + "/*fa"):
        # Raw string avoids the invalid-escape DeprecationWarning of "\.fa".
        outfile = re.sub(r"\.fa", ".hmm.out", infile)
        cmd = [
            hmmer_bin, "-o", tmp_file, "--tblout", outfile, "--cpu", cpu,
            hmmerdb, infile
        ]
        zipfile_loc = outfile + ".zip"
        check_output_and_run(zipfile_loc, cmd)
        if os.path.exists(outfile):
            # Close the archive deterministically (the original leaked it).
            with zipfile.ZipFile(zipfile_loc, 'w', zipfile.ZIP_DEFLATED) as zf:
                zf.write(outfile, os.path.basename(outfile))
    if os.path.isfile(tmp_file):
        os.remove(tmp_file)
def run_iprs(fa_file, config, iprs_loc=None):
    """Run InterProScan on one split FASTA file.

    Skipped (via check_output_and_run) when the ``.tsv`` output for this
    split already exists.

    Parameters
    ----------
    fa_file : str
        Path to the split FASTA file.
    config : dict
        The config dict generated in the gomap.py script.
    iprs_loc : str, optional
        InterProScan install dir; defaults to the configured software path.
    """
    dom_config = config["data"]["domain"]
    iprs_config = config["software"]["iprs"]
    workdir = config["input"]["gomap_dir"] + "/"
    split_base = (workdir + "/" + dom_config["split_path"] + "/" +
                  os.path.basename(fa_file))
    out_file = re.sub(r"\.fa$", "", split_base)
    temp_dir = workdir + dom_config["tmpdir"] + "/temp"
    if iprs_loc is None:
        iprs_loc = iprs_config["path"]
    cmd = [
        iprs_loc + "/interproscan.sh",
        "-goterms", "-pa",
        "-i", fa_file,
        "-dp",
        "-b", out_file,
        "-T", temp_dir,
        "-cpu", str(config["input"]["cpus"]),
    ]
    cmd += iprs_config["options"]
    check_output_and_run(out_file + ".tsv", cmd)
def run_blast(fa_file, blast_db, config):
    """Run blastp for one split FASTA file against ``blast_db`` (XML output).

    The search is skipped when check_bl_out reports the existing output
    already covers all input sequences.

    Parameters
    ----------
    fa_file : str
        Query FASTA file; output path is derived by swapping the extension.
    blast_db : str
        Path/name of the BLAST database.
    config : dict
        The config dict generated in the gomap.py script.
    """
    in_file = fa_file
    out_file = re.sub(r'fa$', "xml", fa_file)
    blast_config = config["software"]["blast"]
    workdir = config["input"]["gomap_dir"] + "/"
    blast_opts = config["data"]["mixed-method"]["preprocess"]["blast_opts"]
    # Guard clause: nothing to do when the output is already complete.
    if check_bl_out(in_file, out_file):
        logging.info(
            out_file +
            " already exists.\n The number of sequences in output match input")
        return
    blast_cmd = [
        blast_config["bin"] + "/blastp",
        "-outfmt", "5",
        "-db", blast_db,
        "-query", in_file,
        "-out", out_file,
        "-num_threads", str(config["input"]["cpus"]),
    ]
    blast_cmd += blast_opts
    print(" ".join(blast_cmd))
    check_output_and_run(out_file, blast_cmd)
def download_mysql_data(config):
    """Download the PANNZER MySQL data files from CyVerse and gunzip them.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    outdir = "/var/lib/mysql/"
    # BUGFIX: the original condition was inverted — it called mkdir only
    # when the directory already existed (and never created a missing one).
    if not os.path.isdir(outdir):
        os.mkdir(outdir)
    outfile = "/var/lib/mysql/pannzer/uniprot.MYI"
    # NOTE(review): `cyverse_path` is a module-level constant defined
    # elsewhere in this file.
    cmd = ["irsync", "-rv", cyverse_path, outdir]
    logging.info("Downloading file from Cyverse using irsync")
    print(" ".join(cmd))
    check_output_and_run(outfile, cmd)
    gz_files = glob("/var/lib/mysql/pannzer/*.gz")
    if gz_files:
        for gz_file in gz_files:
            print("Extracting " + gz_file)
            outfile = gz_file.replace(".gz", "")
            # BUGFIX: the original referenced the undefined name `gzfile`
            # here (NameError at runtime); the loop variable is `gz_file`.
            with gzip.open(gz_file, "rb") as in_f:
                with open(outfile, "wb") as out_f:
                    shutil.copyfileobj(in_f, out_f)
            os.remove(gz_file)
    else:
        print("No mysql files to extract")
def make_uniprotdb(config):
    """Create the UniProt BLAST protein database unless it already exists.

    Scans the blastdb directory for an existing database matching the
    configured basename and only runs ``makeblastdb`` when none is found.

    Parameters
    ----------
    config : dict
        The config dict generated in the gomap.py script.
    """
    uniprot_db = config["mixed-method"]["preprocess"]["uniprot_db"]
    uniprot_fa = uniprot_db + ".fa"
    # NOTE(review): the scan directory is hard-coded rather than derived
    # from uniprot_db — TODO confirm the two always agree.
    files = os.listdir("mixed-method/data/blastdb/")
    db_base = os.path.basename(uniprot_db)
    # BUGFIX: protein BLAST databases use the ".phr" extension (see
    # make_blastdb); the original pattern looked for ".phd", which never
    # matches, so the database was rebuilt on every run.
    db_pattern = re.compile(db_base + ".*phr")
    db_exists = any(db_pattern.match(tmp_file) for tmp_file in files)
    makedb_cmd = [
        "makeblastdb", "-in", uniprot_fa, "-dbtype", "prot", "-out",
        uniprot_db, "-parse_seqids", "-hash_index", "-max_file_sz", "10GB"
    ]
    if db_exists:
        # logging.warning replaces the deprecated logging.warn alias.
        logging.warning(
            "The Uniprot blast database already exists, if not remove the database files to recreate the database"
        )
        logging.info(makedb_cmd)
    else:
        check_output_and_run("temp/uniprotdb", makedb_cmd)
def tair_go2gaf(in_go, out_gaf, config_file):
    """Convert the TAIR GO annotation file to GAF format via an R script.

    Parameters
    ----------
    in_go : str
        Path to the TAIR GO input file (passed through to the R script's
        config; not read directly here).
    out_gaf : str
        Expected output GAF path; the run is skipped if it already exists.
    config_file : str
        Path to the pipeline config file handed to the R script.
    """
    logging.info("Converting TAIR GO file to GAF format")
    cmd = ["Rscript", "code/R/tair2gaf.r", config_file]
    basic_utils.check_output_and_run(out_gaf, cmd)
def clean_redundant(config):
    """Remove redundant GO annotations by running clean_redundancy.r."""
    cfg_file = config["input"]["config_file"]
    check_output_and_run(
        "test.pod",
        ["Rscript", "code/pipeline/clean_redundancy.r", cfg_file])
def aggregate_datasets(config):
    """Aggregate the per-method annotation datasets via an R script."""
    rscript_cmd = ["Rscript", "code/pipeline/aggregate_datasets.r",
                   config["input"]["config_file"]]
    check_output_and_run("test.pod", rscript_cmd)
def fanngo2gaf(config):
    """Convert FANN-GO scores to GAF format via an R script."""
    check_output_and_run("test.pod", [
        "Rscript",
        "code/pipeline/fanngo2gaf.R",
        config["input"]["config_file"],
    ])
def clean_duplicate(config):
    """Remove duplicate annotations by running clean_duplicate.r."""
    dedup_cmd = ["Rscript", "code/pipeline/clean_duplicate.r",
                 config["input"]["config_file"]]
    check_output_and_run("test.pod", dedup_cmd)
def filter_mixed(config):
    """Filter the mixed-method annotations via an R script."""
    cfg_file = config["input"]["config_file"]
    filter_cmd = ["Rscript", "code/pipeline/filter_mixed.r", cfg_file]
    check_output_and_run("test.pod", filter_cmd)