def __init__(self, filter_id, group_id, req, out_dir=".", order=None,
             min_identity=30, min_blast_evalue=0.01, save_to_ITOL=False,
             sep=",", report_unfit=False):
    self.req = req
    self.order = order
    self.filter_id = filter_id
    self.group_id = group_id
    self.out_dir = os.path.abspath(out_dir)
    self.min_identity = min_identity
    self.min_blast_evalue = min_blast_evalue
    self.save_to_ITOL = save_to_ITOL
    self.sep = sep
    self.report_unfit = report_unfit
    if self.order is None:  ## user didn't override the order argument
        d = os.path.abspath(os.path.dirname(__file__))
        data_env = os.path.join(d, 'data/')
        self.order = utils.get_order(self.req, data_env)
    ## get blast versions
    self.blastp = self._get_blast_version("blastp")
    self.makeblastdb = self._get_blast_version("makeblastdb")
    utils.assure_path_exists(os.path.join(self.out_dir, group_id + "_GROUP"))
    utils.write_log(os.path.join(self.out_dir, group_id + "_GROUP", "LOG"),
                    "STEP 4 : GROUP ORFs", vars(self), "")
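
## Hedged sketch: utils.assure_path_exists is called throughout this section
## but defined elsewhere; an implementation consistent with how it is used
## here (create the directory tree if missing, do nothing otherwise) would be:
def assure_path_exists(path):
    if not os.path.isdir(path):
        os.makedirs(path)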
def main():
    gpu = '0'  # which GPU to use for experiment
    data_path = './cub200_resnet'  # path to data
    save_path = './results'  # path to save results
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    utils.assure_path_exists(save_path)
    dataset = 'cub200'
    capacity_list = utils.get_dataset_params(dataset)
    experiment_types = ['iid', 'class_iid']

    # loop over all experiment types
    for experiment_type in experiment_types:
        print('Experiment Type: ', experiment_type)

        # load and sort data
        X_train, y_train, X_test, y_test = utils.load_cub200(
            data_path, experiment_type)

        # loop over desired capacities
        for c in capacity_list:
            print('Capacity: ', c)
            acc, mpca = experiment(dataset, X_train, y_train, X_test, y_test, c)

            # save results out
            acc_name = 'acc_' + experiment_type + '_' + dataset + '_exstream_capacity_' + str(c)
            mpca_name = 'mpca_' + acc_name
            np.save(save_path + '/' + acc_name, np.asarray(acc))
            np.save(save_path + '/' + mpca_name, np.asarray(mpca))
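
## Hedged usage sketch: np.save appends a ".npy" suffix, so the arrays written
## by main() are loaded back as below. load_result is a hypothetical helper,
## not part of the original code; the names mirror those built in main().
def load_result(save_path, experiment_type, dataset, capacity):
    acc_name = 'acc_' + experiment_type + '_' + dataset + '_exstream_capacity_' + str(capacity)
    acc = np.load(os.path.join(save_path, acc_name + '.npy'))
    mpca = np.load(os.path.join(save_path, 'mpca_' + acc_name + '.npy'))
    return acc, mpca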
def write_matrix_to_file(args, group_dir, network_type, binary):
    ''' write the matrix to a file, both as a CSV and for reading into ITOL if asked'''
    with open(os.path.join(group_dir, network_type + ".csv"), "wb") as f:
        writer = csv.writer(f)
        writer.writerows(binary)
    if args.itol:
        ## get the maximum value of the matrix for ITOL
        max_val = [item for sublist in binary for item in sublist]
        max_val = [x for x in max_val if isinstance(x, int)]
        max_val = max(max_val)
        utils.assure_path_exists(os.path.join(group_dir, "ITOL"))
        with open(os.path.join(group_dir, "ITOL", network_type + ".txt"), "wb") as f:
            f.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL," +
                    network_type + "\nCOLOR,#ff0000\nFIELD_LABELS," +
                    ",".join(binary[0][1:]) + "\nDATA\n")
            for i in range(1, len(binary)):
                f.write(",".join(map(str, binary[i])) + "\n")
        strains = [row[0] for row in binary][1:]
        for i in range(1, len(binary[0])):
            feature_vec = [row[i] for row in binary]
            ta = feature_vec[0]
            feature_vec = feature_vec[1:]
            ## create a single file for one system
            generate_single_itol_output(group_dir, feature_vec, ta, strains,
                                        network_type, max_val)
    return
def run(args): args = copy.deepcopy(args) if "filter_id" not in vars(args): filter_id = args.id elif args.filter_id is None: filter_id = args.id else: filter_id = args.filter_id args.out_dir = os.path.abspath(args.out_dir) filter_dir = os.path.join(args.out_dir, filter_id + "_FILTER") group_dir = os.path.join(args.out_dir, args.id + "_GROUP") utils.assure_path_exists(group_dir) blast_dir = os.path.join(group_dir, "blast_files") if args.order is None: ## user didn't override the order argument d = os.path.abspath(os.path.dirname(__file__)) data_env = os.path.join(d, 'data/') args.order = utils.get_order(args.hmm_db, data_env) ## write LOG file get_blast_version(args.blastp, "blastp") get_blast_version(args.makeblastdb, "makeblastdb") utils.write_log(os.path.join(group_dir, "LOG"), "STEP 4 : GROUP ORFs", vars(args), "") run_blast(args, group_dir, filter_dir, blast_dir) group_operons(args, group_dir, filter_dir, blast_dir) return
def _write_matrix_to_file(self, network_type, binary):
    # once all components have been processed, save the matrix
    utils.assure_path_exists(os.path.join(self.group_dir, network_type + "_clusters"))
    with open(os.path.join(self.group_dir, network_type + ".csv"), "wb") as f:
        writer = csv.writer(f)
        writer.writerows(binary)
    if self.save_to_ITOL:
        ## get the maximum value of the matrix for ITOL
        max_val = [item for sublist in binary for item in sublist]
        max_val = [x for x in max_val if isinstance(x, int)]
        max_val = max(max_val)
        utils.assure_path_exists(os.path.join(self.group_dir, "ITOL"))
        with open(os.path.join(self.group_dir, "ITOL", network_type + ".txt"), "wb") as f:
            f.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL," +
                    network_type + "\nCOLOR,#ff0000\nFIELD_LABELS," +
                    ",".join(binary[0][1:]) + "\nDATA\n")
            for i in range(1, len(binary)):
                f.write(",".join(map(str, binary[i])) + "\n")
        strains = [row[0] for row in binary][1:]
        for i in range(1, len(binary[0])):
            feature_vec = [row[i] for row in binary]
            ta = feature_vec[0]
            feature_vec = feature_vec[1:]
            self._annotate_system(feature_vec, ta, strains, network_type, max_val)
def _write_toxin_output(self, label, curr_nodes, num_copies, domains, scores, toxins_lengths):
    # calc general properties
    utils.assure_path_exists(os.path.join(self.group_dir, "hits_clusters"))
    outfile = open(os.path.join(self.group_dir, "hits_clusters", label + ".txt"), "w")
    outfile.write("## ID: " + label + "\n")
    outfile.write("## Num_Strains: " + str(num_copies) + "\n")
    outfile.write("## Domains: " + ",".join(domains) + "\n")
    outfile.write("## Average Hit Length: " + str(np.mean(toxins_lengths)) + "\n")
    outfile.write("## Average HMMER score: " + str(np.mean(scores)) + "\n")
    outfile.write(self.sep.join(['Strain', 'Upstream_Cluster', 'Downstream_Cluster', 'Domain',
                                 'HMMER_score', 'Contig', 'Strand', 'Hit_length', 'Hit_start',
                                 'Hit_stop', 'Upstream_length', 'Upstream_delta',
                                 'Upstream_start', 'Upstream_stop', 'Downstream_length',
                                 'Downstream_delta', 'Downstream_start', 'Downstream_stop',
                                 'Hit', 'Upstream', 'Downstream', 'Source']) + "\n")
    for n in curr_nodes:
        hit_toks = n.split("*")
        hit_ID = hit_toks[0]
        curr_hit = self.hits[hit_ID]
        toks = curr_hit.toks
        strain = hit_ID.split("|")[0]
        outfile.write(self.sep.join(map(str, [
            strain, curr_hit.clusters["upstream"], curr_hit.clusters["downstream"],
            toks['Domain'].tolist()[0], toks['HMMER_score'].tolist()[0],
            toks['Contig'].tolist()[0], toks['Strand'].tolist()[0],
            toks['Hit_length'].tolist()[0], toks['Hit_start'].tolist()[0],
            toks['Hit_stop'].tolist()[0], toks['Upstream_length'].tolist()[0],
            toks['Upstream_delta'].tolist()[0], toks['Upstream_start'].tolist()[0],
            toks['Upstream_stop'].tolist()[0], toks['Downstream_length'].tolist()[0],
            toks['Downstream_delta'].tolist()[0], toks['Downstream_start'].tolist()[0],
            toks['Downstream_stop'].tolist()[0], toks['Hit'].tolist()[0],
            toks['Upstream'].tolist()[0], toks['Downstream'].tolist()[0],
            toks['Source'].tolist()[0]])) + "\n")
    outfile.close()
def run(args):
    args = copy.deepcopy(args)
    # create output directory
    out_dir = os.path.abspath(args.out_dir)
    if "prep_id" not in vars(args):
        prep_id = args.id
    elif args.prep_id is None:
        prep_id = args.id
    else:
        prep_id = args.prep_id
    prep_dir = os.path.join(out_dir, prep_id + "_PREPARE")
    scan_dir = os.path.join(out_dir, args.id + "_SCAN")
    utils.assure_path_exists(scan_dir)  ## create the output directory
    if args.hmm_db not in utils.databases:  ## check if the database exists
        run_hmmpress(args)
    else:
        # before starting, copy the data directory to the output directory
        copy_data(args, scan_dir)
    ## create list of jobs for HMMSEARCH
    jobs = create_jobs_list(args, prep_dir, scan_dir)
    ## run the pool
    pool = multiprocessing.Pool(args.cpu)
    results = pool.map_async(run_hmmer, tuple(jobs))
    pool.close()
    pool.join()
    return
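
## Note: map_async above discards its AsyncResult, so exceptions raised inside
## run_hmmer are silently dropped. A hypothetical drop-in variant of the pool
## tail that fails loudly instead:
def run_jobs(jobs, cpu):
    pool = multiprocessing.Pool(cpu)
    results = pool.map_async(run_hmmer, tuple(jobs))
    pool.close()
    pool.join()
    results.get()  # re-raises the first worker exception, if any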
def report_unfit(args):
    if not args["report_unfit"]:
        return
    merge_unfit(args)
    utils.assure_path_exists(os.path.dirname(args["report_unfit"]))
    report = open(args["report_unfit"], "w")
    report.write(args["sep"].join(["Strain", "Domain", "HMMER_score", "Contig", "Strand",
                                   "Hit_length", "Hit_start", "Hit_stop", "Sequence",
                                   "Reason1", "Reason2"]) + "\n")
    for o in args["unfit_orfs"]:
        if args["unfit_orfs"][o].unfit:
            report.write(args["unfit_orfs"][o]._to_unfit(args["basename"], args["sep"]) + "\n")
    report.close()
def run_blast(args, group_dir, filter_dir, out_dir):
    ''' run blast on all hits and partners'''
    utils.assure_path_exists(out_dir)
    hits_to_fasta(args, out_dir, filter_dir)
    call_blast_command(args, out_dir, "hits")
    if args.order == "both":
        call_blast_command(args, out_dir, "upstream")
        call_blast_command(args, out_dir, "downstream")
    else:
        call_blast_command(args, out_dir, "partners")
    return
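
## Hedged sketch: call_blast_command is defined elsewhere. Given min_identity
## and min_blast_evalue in the class above, a plausible all-vs-all BLAST using
## standard BLAST+ flags would look like this; the exact command SLING builds
## is an assumption.
def call_blast_command_sketch(args, out_dir, name):
    import subprocess
    fasta = os.path.join(out_dir, name + ".fasta")
    subprocess.call([args.makeblastdb, "-in", fasta, "-dbtype", "prot"])
    subprocess.call([args.blastp, "-query", fasta, "-db", fasta,
                     "-out", os.path.join(out_dir, name + ".blast"),
                     "-outfmt", "6", "-evalue", str(args.min_blast_evalue)])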
def run(args):
    args = copy.deepcopy(args)
    # define input and output directories
    args.out_dir = os.path.abspath(args.out_dir)
    if "prep_id" not in vars(args):
        prep_id = args.id
    elif args.prep_id is None:
        prep_id = args.id
    else:
        prep_id = args.prep_id
    if "scan_id" not in vars(args):
        scan_id = args.id
    elif args.scan_id is None:
        scan_id = args.id
    else:
        scan_id = args.scan_id
    prep_dir = os.path.join(args.out_dir, prep_id + "_PREPARE")
    scan_dir = os.path.join(args.out_dir, scan_id + "_SCAN")
    results_dir = os.path.join(args.out_dir, args.id + "_FILTER")
    if args.report_unfit:
        args.report_unfit = os.path.join(results_dir, "UNFIT")
        utils.assure_path_exists(args.report_unfit)
    # create the results directory
    utils.assure_path_exists(results_dir)
    # get the SLING data environment
    d = os.path.abspath(os.path.dirname(__file__))
    data_env = os.path.join(d, 'data/')
    # convert args to a dictionary and get all FILTERING requirements
    args = vars(args)
    get_requirements(args, data_env)
    # get the profile lengths and profiles to ignore
    profile_lengths = parse_domains(args["hmm_db"], args["domains_file"], data_env)
    profiles_to_ignore = parse_domains_to_ignore(args["hmm_db"], args["domains_to_ignore"], data_env)
    args["profiles_to_ignore"] = profiles_to_ignore
    args["profile_lengths"] = profile_lengths
    jobs = create_job_list(results_dir, prep_dir, scan_dir, args)
    ### run all the jobs as a pool
    utils.run_pool(jobs, args, run_summarise)
    return
def _annotate_system(self, feature_vec, ta, strains, network_type, max_val):
    ## preset the colors to be loaded in ITOL
    max_colors = {"complete": "#005900", "hits": "#3b1365", "upstream": "#000099",
                  "partners": "#000099", "downstream": "#cf4c0b", "unfit": "#990000"}
    outdir = os.path.join(self.group_dir, "ITOL/", network_type + "_clusters")
    utils.assure_path_exists(outdir)
    out = open(os.path.join(outdir, ta + ".txt"), "w")
    out.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL," + ta +
              "\nCOLOR,#ff0000\nFIELD_LABELS," + ta +
              "\nCOLOR_MIN,#eeeded\nCOLOR_MAX," + max_colors[network_type] +
              "\nUSER_MIN_VALUE,0\nUSER_MAX_VALUE," + str(max_val) + "\nDATA\n")
    for i in range(0, len(strains)):
        out.write(strains[i] + "," + str(feature_vec[i]) + "\n")
    out.close()
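
## For reference, a file written by _annotate_system looks like this (one
## strain row per line after DATA; bracketed values are filled in at runtime):
##   DATASET_HEATMAP
##   SEPARATOR COMMA
##   DATASET_LABEL,<ta>
##   COLOR,#ff0000
##   FIELD_LABELS,<ta>
##   COLOR_MIN,#eeeded
##   COLOR_MAX,<max_colors[network_type]>
##   USER_MIN_VALUE,0
##   USER_MAX_VALUE,<max_val>
##   DATA
##   <strain>,<value>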
def copy_data(args, scan_dir):
    ''' if database exists in SLING, copy its content to the current working
    directory (prevents runtime errors)'''
    d = os.path.abspath(os.path.dirname(__file__))
    data_env = os.path.join(d, 'data/')
    curr_env = os.path.join(scan_dir, "data")
    utils.assure_path_exists(curr_env)
    for f in os.listdir(data_env):
        if args.hmm_db in f or "domains" in f:
            copyfile(os.path.join(data_env, f), os.path.join(curr_env, f))
    args.hmm_db = os.path.join(scan_dir, 'data', args.hmm_db)
    return
def create_complete_files(args, group_dir, strains):
    ''' aggregate the hit clusters and partners clusters to report on the full operons.
    Generate all the output files for the complete clusters'''
    completes = {}
    utils.assure_path_exists(os.path.join(group_dir, "complete_clusters"))
    for filename in os.listdir(os.path.join(group_dir, "hits_clusters")):
        curr_files = {}  ## create a new file for each unique operon
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(group_dir, "hits_clusters", filename)) as f:
            toxin = filename.split(".")[0]  ## get the toxin ID
            for line in f:
                if line.startswith("#"):
                    continue
                toks = line.strip().split(args.sep)
                if line.startswith("Strain"):
                    header = line
                    domain_index = toks.index("Domain")
                    continue
                domain = toks[domain_index]
                write_line_to_complete(args, group_dir, toxin, toks[1], toks[2],
                                       curr_files, line, header, completes, domain)
        ## close all the cluster files that were opened and copied
        for c_file in curr_files:
            curr_files[c_file].close()
    ## create a matrix of the results
    binary = [["Strain"] + map(str, range(1, 1 + len(completes)))]
    for strain in strains:
        binary.append([strain] + [0] * len(completes))
    curr_column = 1
    for complete in completes:
        binary[0][curr_column] = complete
        for strain in completes[complete]:
            row = strains.index(strain) + 1
            binary[row][curr_column] += 1
        curr_column += 1
    ## write the matrix to a file
    write_matrix_to_file(args, group_dir, "complete", binary)
    return
def write_results(args):
    # create output directory if it doesn't exist
    utils.assure_path_exists(os.path.dirname(args["out_file"]))
    ## write files
    out = open(args["out_file"], "w")
    if args["req_dict"]["order"] == "upstream":
        out.write(args["sep"].join(["Domain", "HMMER_score", "Contig", "Strand",
                                    "Hit_length", "Hit_start", "Hit_stop",
                                    "Upstream_length", "Upstream_delta",
                                    "Upstream_start", "Upstream_stop",
                                    "Hit", "Upstream", "Source"]) + "\n")
    elif args["req_dict"]["order"] == "downstream":
        out.write(args["sep"].join(["Domain", "HMMER_score", "Contig", "Strand",
                                    "Hit_length", "Hit_start", "Hit_stop",
                                    "Downstream_length", "Downstream_delta",
                                    "Downstream_start", "Downstream_stop",
                                    "Hit", "Downstream", "Source"]) + "\n")
    else:
        out.write(args["sep"].join(["Domain", "HMMER_score", "Contig", "Strand",
                                    "Hit_length", "Hit_start", "Hit_stop",
                                    "Upstream_length", "Upstream_delta",
                                    "Upstream_start", "Upstream_stop",
                                    "Downstream_length", "Downstream_delta",
                                    "Downstream_start", "Downstream_stop",
                                    "Hit", "Upstream", "Downstream", "Source"]) + "\n")
    for o in args["orfs"]:
        if args["orfs"][o].keep:
            out.write(args["orfs"][o]._to_string(args["sep"], args["req_dict"]["order"]) + "\n")
    out.close()
def _create_complete_files(self):
    completes = {}
    utils.assure_path_exists(os.path.join(self.group_dir, "complete_clusters"))
    for file in os.listdir(os.path.join(self.group_dir, "hits_clusters")):
        curr_files = {}
        if not file.endswith(".txt"):
            continue
        with open(os.path.join(self.group_dir, "hits_clusters", file)) as f:
            toxin = file.split(".")[0]
            for line in f:
                if line.startswith("#"):
                    continue
                if line.startswith("Strain"):
                    header = line
                    continue
                toks = line.strip().split(self.sep)
                domain = toks[3]
                if self.order == "upstream" or self.order == "downstream":
                    self._write_line_to_complete(toxin, toks[1], curr_files,
                                                 line, header, completes, domain)
                else:
                    self._write_line_to_both(toxin, toks[1], toks[2], curr_files,
                                             line, header, completes, domain)
        for c_file in curr_files:
            curr_files[c_file].close()
    ## create a matrix of the results
    binary = [["Strain"] + map(str, range(1, 1 + len(completes)))]
    for strain in self.strains:
        binary.append([strain] + [0] * len(completes))
    curr_column = 1
    for complete in completes:
        binary[0][curr_column] = complete
        for strain in completes[complete]:
            row = self.strains.index(strain) + 1
            binary[row][curr_column] += 1
        curr_column += 1
    self._write_matrix_to_file("complete", binary)
def _write_antitoxin_output(self, network_type, label, curr_nodes, num_copies, domains,
                            scores, toxins_lengths, directions, antitoxins_lengths, deltas):
    utils.assure_path_exists(os.path.join(self.group_dir, network_type + "_clusters"))
    outfile = open(os.path.join(self.group_dir, network_type + "_clusters", label + ".txt"), "w")
    outfile.write("## ID: " + label + "\n")
    outfile.write("## Num Copies: " + str(num_copies) + "\n")
    outfile.write("## Domains: " + ",".join(domains) + "\n")
    outfile.write("## Average Hit Length: " + str(np.mean(toxins_lengths)) + "\n")
    outfile.write("## Average HMMER score: " + str(np.mean(scores)) + "\n")
    if network_type != "hits":
        outfile.write("## Average Delta: " + str(np.mean(deltas)) + "\n")
        outfile.write("## Average Partner Length: " + str(np.mean(antitoxins_lengths)) + "\n")
    if self.order == "either":
        outfile.write("## Order: Upstream: " + str(directions["standard"]) +
                      " Downstream: " + str(directions["reverse"]) + "\n")
    ## add information on each of the hits separately
    outfile.write(self.sep.join(["Strain", "Hit_Cluster", "Domain", "HMMER_score", "Order",
                                 "Contig", "Strand", "Partner_Length", "Delta",
                                 "Partner_Start", "Partner_Stop", "Hit_Length", "Hit_Start",
                                 "Hit_Stop", "Partner", "Sequence", "Source"]) + "\n")
    for n in curr_nodes:
        hit_toks = n.split("*")
        hit_ID = hit_toks[0]
        curr_hit = self.hits[hit_ID]
        toks = curr_hit.toks
        strain = hit_ID.split("|")[0]
        cluster = curr_hit.clusters["hit"]
        if hit_toks[1] == "upstream" or self.order == "upstream":
            if self.order == "upstream":
                cluster = curr_hit.clusters["upstream"]
            outfile.write(self.sep.join(map(str, [
                strain, cluster, toks['Domain'].tolist()[0], toks['HMMER_score'].tolist()[0],
                "upstream", toks['Contig'].tolist()[0], toks['Strand'].tolist()[0],
                toks['Upstream_length'].tolist()[0], toks['Upstream_delta'].tolist()[0],
                toks['Upstream_start'].tolist()[0], toks['Upstream_stop'].tolist()[0],
                toks['Hit_length'].tolist()[0], toks['Hit_start'].tolist()[0],
                toks['Hit_stop'].tolist()[0], toks['Upstream'].tolist()[0],
                toks['Hit'].tolist()[0], toks['Source'].tolist()[0]])) + "\n")
        else:
            if self.order == "downstream":
                cluster = curr_hit.clusters["downstream"]
            outfile.write(self.sep.join(map(str, [
                strain, cluster, toks['Domain'].tolist()[0], toks['HMMER_score'].tolist()[0],
                "downstream", toks['Contig'].tolist()[0], toks['Strand'].tolist()[0],
                toks['Downstream_length'].tolist()[0], toks['Downstream_delta'].tolist()[0],
                toks['Downstream_start'].tolist()[0], toks['Downstream_stop'].tolist()[0],
                toks['Hit_length'].tolist()[0], toks['Hit_start'].tolist()[0],
                toks['Hit_stop'].tolist()[0], toks['Downstream'].tolist()[0],
                toks['Hit'].tolist()[0], toks['Source'].tolist()[0]])) + "\n")
    outfile.close()
def _write_unfit_output(self, label, curr_nodes, num_copies, domains, scores, lengths, reasons):
    utils.assure_path_exists(os.path.join(self.group_dir, "unfit_clusters"))
    outfile = open(os.path.join(self.group_dir, "unfit_clusters", label + ".txt"), "w")
    outfile.write("## ID: " + label + "\n")
    outfile.write("## Num Copies: " + str(num_copies) + "\n")
    outfile.write("## Domains: " + ",".join(domains) + "\n")
    outfile.write("## Average Hit Length: " + str(np.mean(lengths)) + "\n")
    outfile.write("## Average HMMER score: " + str(np.mean(scores)) + "\n")
    outfile.write("## Reasons: " + ",".join(map(str, list(reasons))) + "\n")
    ## add information on each of the hits separately
    outfile.write(self.sep.join(["Strain", "Unfit_Cluster", "Hit_Clusters", "Domain",
                                 "HMMER_score", "Contig", "Strand", "Hit_Length",
                                 "Hit_Start", "Hit_Stop", "Sequence", "Reason1",
                                 "Reason2"]) + "\n")
    for n in curr_nodes:
        hit_toks = n.split("*")
        hit_ID = hit_toks[0]
        curr_hit = self.unfits[hit_ID]
        toks = curr_hit.toks
        strain = hit_ID.split("|")[0]
        unfit_cluster = curr_hit.clusters["unfit"]
        hit_clusters = curr_hit.clusters["hit"]
        outfile.write(self.sep.join(map(str, [
            strain, unfit_cluster, hit_clusters, toks['Domain'].tolist()[0],
            toks['HMMER_score'].tolist()[0], toks['Contig'].tolist()[0],
            toks['Strand'].tolist()[0], toks['Hit_length'].tolist()[0],
            toks['Hit_start'].tolist()[0], toks['Hit_stop'].tolist()[0],
            toks['Sequence'].tolist()[0], toks['Reason1'].tolist()[0],
            toks['Reason2'].tolist()[0]])) + "\n")
    outfile.close()
def write_single_output(args, network_type, keys, hits, group_dir, label, curr_nodes,
                        num_copies, domains, scores, toxins_lengths, antitoxins_lengths,
                        deltas, directions, reasons):
    ''' create the output file for one cluster. Slightly messy because the output
    differs depending on whether the cluster is a toxin, an antitoxin or both. '''
    utils.assure_path_exists(os.path.join(group_dir, network_type + "_clusters"))
    outfile = open(os.path.join(group_dir, network_type + "_clusters", label + ".txt"), "w")
    ### Attributes of this cluster ###
    outfile.write("# ID: " + label + "\n")
    outfile.write("# Num copies: " + str(num_copies) + "\n")
    outfile.write("# Domains: " + ",".join(domains) + "\n")
    outfile.write("# Average Hit Length: " + str(np.mean(toxins_lengths)) + "\n")
    outfile.write("# Average HMMER score: " + str(np.mean(scores)) + "\n")
    if network_type in ["partners", "upstream", "downstream"]:
        outfile.write("## Average Delta: " + str(np.mean(deltas)) + "\n")
        outfile.write("## Average Partner Length: " + str(np.mean(antitoxins_lengths)) + "\n")
    if args.order == "either" and network_type != "unfit":
        outfile.write("# Order: Upstream: " + str(directions["standard"]) +
                      " Downstream: " + str(directions["reverse"]) + "\n")
    if network_type == "unfit":
        outfile.write("# Reasons: " + ",".join(map(str, list(reasons))) + "\n")
    ### HEADER ###
    if network_type == "hits":
        if args.order in ["either", "both"]:
            outfile.write(args.sep.join(['Strain', 'Upstream_Cluster',
                                         'Downstream_Cluster'] + keys) + "\n")
        else:  ## order is upstream or downstream: a single partner column
            outfile.write(args.sep.join(['Strain', 'Partner_Cluster'] + keys) + "\n")
    elif network_type == "unfit":
        outfile.write(args.sep.join(['Strain', 'Hit_Cluster'] + keys) + "\n")
    else:  # network_type in ["partners", "upstream", "downstream"]: antitoxin output
        outfile.write(args.sep.join(["Strain", "Hit_Cluster", "Domain", "HMMER_score",
                                     "Order", "Contig", "Strand", "Partner_Length",
                                     "Delta", "Partner_Start", "Partner_Stop",
                                     "Hit_Length", "Hit_Start", "Hit_Stop",
                                     "Partner", "Sequence", "Source"]) + "\n")
    ### one line per member of this cluster ###
    for n in curr_nodes:
        hit_toks = n.split("*")
        hit_type = hit_toks[1]
        hit_ID = hit_toks[0]
        curr_hit = hits[hit_ID]
        strain = hit_ID.split("|")[0]
        if hit_type == "hit":  ## toxins
            if args.order in ["either", "both"]:
                outfile.write(args.sep.join([strain,
                                             curr_hit["clusters"]["upstream"],
                                             curr_hit["clusters"]["downstream"]]))
            elif args.order == "upstream":
                outfile.write(args.sep.join([strain, curr_hit["clusters"]["upstream"]]))
            else:
                outfile.write(args.sep.join([strain, curr_hit["clusters"]["downstream"]]))
            for k in keys:
                outfile.write("," + str(curr_hit[k]))
            outfile.write("\n")
        elif hit_type == "unfit":
            hit_clusters = "+".join(list(curr_hit["clusters"]["hit"]))
            outfile.write(args.sep.join([strain, hit_clusters]))
            for k in keys:
                outfile.write("," + str(curr_hit[k]))
            outfile.write("\n")
        else:  ## antitoxin
            cluster = curr_hit["clusters"]["hit"]
            if hit_type == "upstream":
                outfile.write(args.sep.join(map(str, [
                    strain, cluster, curr_hit['Domain'], curr_hit['HMMER_score'],
                    "upstream", curr_hit['Contig'], curr_hit['Strand'],
                    curr_hit['Upstream_length'], curr_hit['Upstream_delta'],
                    curr_hit['Upstream_start'], curr_hit['Upstream_stop'],
                    curr_hit['Hit_length'], curr_hit['Hit_start'], curr_hit['Hit_stop'],
                    curr_hit['Upstream'], curr_hit['Hit'], curr_hit['Source']])) + "\n")
            else:  ## downstream
                outfile.write(args.sep.join(map(str, [
                    strain, cluster, curr_hit['Domain'], curr_hit['HMMER_score'],
                    "downstream", curr_hit['Contig'], curr_hit['Strand'],
                    curr_hit['Downstream_length'], curr_hit['Downstream_delta'],
                    curr_hit['Downstream_start'], curr_hit['Downstream_stop'],
                    curr_hit['Hit_length'], curr_hit['Hit_start'], curr_hit['Hit_stop'],
                    curr_hit['Downstream'], curr_hit['Hit'], curr_hit['Source']])) + "\n")
    outfile.close()
    return
def _hits_to_fasta(self):
    # combine all the hits and write them into a single fasta file to run blast
    utils.assure_path_exists(self.out_dir)
    hits = open(os.path.join(self.out_dir, "hits.fasta"), "w")
    if self.order == "both":
        downstream = open(os.path.join(self.out_dir, "downstream.fasta"), "w")
        upstream = open(os.path.join(self.out_dir, "upstream.fasta"), "w")
    else:
        partners = open(os.path.join(self.out_dir, "partners.fasta"), "w")
    for file in os.listdir(self.results_dir):
        if not file.endswith(".csv"):
            continue
        strain = os.path.basename(file)
        strain = strain.replace(".csv", "")
        with open(os.path.join(self.results_dir, file)) as f:
            line_num = 1
            for line in f:
                if line.startswith("Domain"):
                    continue
                toks = line.strip().split(self.sep)
                identifier = ">" + strain + "|" + str(line_num)
                if self.order == "either" or self.order == "both":
                    hits.write(identifier + "*hit" + "\n" + toks[15] + "\n")
                    if self.order == "either":
                        if toks[16] != "":
                            partners.write(identifier + "*upstream" + "\n" + toks[16] + "\n")
                        if toks[17] != "":
                            partners.write(identifier + "*downstream" + "\n" + toks[17] + "\n")
                    else:
                        upstream.write(identifier + "*upstream" + "\n" + toks[16] + "\n")
                        downstream.write(identifier + "*downstream" + "\n" + toks[17] + "\n")
                else:
                    hits.write(identifier + "*hit" + "\n" + toks[11] + "\n")
                    partners.write(identifier + "*" + self.order + "\n" + toks[12] + "\n")
                line_num += 1
    ## if the unfit were also reported in the previous step, add them to the
    ## "hits" in the network analysis
    if self.report_unfit and not os.path.exists(os.path.join(self.results_dir, "UNFIT")):
        warnings.warn("Could not find UNFIT files from SUMMARISE step. To report "
                      "unfit, turn on --report_unfit / -u flag in FILTER and run again.")
    if os.path.exists(os.path.join(self.results_dir, "UNFIT")):
        for file in os.listdir(os.path.join(self.results_dir, "UNFIT")):
            if not file.endswith(".csv"):
                continue
            with open(os.path.join(self.results_dir, "UNFIT", file)) as f:
                line_num = 1
                for line in f:
                    if line.startswith("Strain"):
                        continue
                    toks = line.strip().split(",")
                    strain = toks[0]
                    identifier = ">" + strain + "|" + str(line_num) + "*unfit"
                    hits.write(identifier + "\n" + toks[8] + "\n")
                    line_num += 1
    hits.close()
    if self.order == "both":
        upstream.close()
        downstream.close()
    else:
        partners.close()
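
## For reference, the FASTA identifiers written above encode strain, line
## number and role, e.g.:
##   >strain_A|3*hit
##   >strain_A|3*upstream
##   >strain_A|7*unfit
## Downstream code recovers the parts with n.split("*") and hit_ID.split("|").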
def run(self):
    ## create output directory
    utils.assure_path_exists(self.args["scan_dir"])
    if self.args["hmm_db"] not in utils.databases:
        self._run_hmmpress()
    else:
        ## before starting, copy the data directory into the output directory,
        ## so hmm_db can point at the copy under scan_dir/data
        d = os.path.abspath(os.path.dirname(__file__))
        data_env = os.path.join(d, 'data/')
        os.system("mkdir -p " + os.path.join(self.args["scan_dir"], "data"))
        os.system("cp -r " + data_env + " " + self.args["scan_dir"])
        self.args["hmm_db"] = os.path.join(self.args["scan_dir"], 'data', self.args["hmm_db"])
    ## get versions of hmmscan and hmmpress for the log file
    log_other = self._get_hmmer_version("hmmscan")
    log_other = log_other + self._get_hmmer_version("hmmpress")
    # keep a text record of all the genomes used
    log_other = log_other + "### INPUT ### \ncnt\tgenome\tfasta_file\tgff_file\n"
    jobs = []
    cnt = 1
    for file in os.listdir(self.args["prep_dir"]):
        if file.endswith(".sixframe.fasta"):
            basename = os.path.basename(file)
            basename = basename.replace(".sixframe.fasta", "")
            sixframe_file = os.path.join(self.args["prep_dir"], basename + ".sixframe.fasta")
            annotated_file = os.path.join(self.args["prep_dir"], basename + ".annotated.fasta")
            scan_genome = {"basename": basename, "source": "sixframe",
                           "fasta_file": sixframe_file, "out_dir": self.args["scan_dir"],
                           "hmm_db": self.args["hmm_db"],
                           "hmmscan": self.args["configs"]["hmmscan"]}
            jobs.append(scan_genome)
            if os.path.isfile(annotated_file):
                scan_genome = {"basename": basename, "source": "annotated",
                               "fasta_file": annotated_file, "out_dir": self.args["scan_dir"],
                               "hmm_db": self.args["hmm_db"],
                               "hmmscan": self.args["configs"]["hmmscan"]}
                jobs.append(scan_genome)
                log_other = log_other + str(cnt) + "\t" + basename + "\t" + sixframe_file + "\t" + annotated_file + "\n"
            else:
                log_other = log_other + str(cnt) + "\t" + basename + "\t" + sixframe_file + "\tnot found\n"
            cnt += 1
    utils.write_log(os.path.join(self.args["scan_dir"], "LOG"),
                    "STEP 2 : GENOME SCANNING", self.args, log_other)
    ## allocate CPUs efficiently
    CPUs_per_scan = 5
    ## if there are more CPUs than jobs, or fewer than 5 CPUs in total, skip
    ## multiprocessing and give all available CPUs to each hmmscan call
    if self.args["cpu"] > len(jobs) or self.args["cpu"] < CPUs_per_scan:
        pool_size = 1
        CPUs_per_scan = self.args["cpu"]
    ## otherwise create a pool of size cpu // 5, so each scan runs on 5 CPUs
    ## (any remainder CPUs stay idle)
    else:
        pool_size = self.args["cpu"] // CPUs_per_scan  # integer division
    for j in jobs:
        j["scan_cpu"] = CPUs_per_scan
    pool = multiprocessing.Pool(processes=pool_size)
    results = pool.map_async(run_scan, tuple(jobs))
    pool.close()
    pool.join()
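
## Worked example of the allocation above: with cpu=12 and CPUs_per_scan=5,
## pool_size = 12 // 5 = 2, so two hmmscan jobs run concurrently with 5 CPUs
## each and the remaining 2 CPUs stay idle.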