Code example #1
	def __init__(self, 
		filter_id,
		group_id,
		req,
		out_dir=".",
		order = None,
		min_identity = 30,
		min_blast_evalue = 0.01,
		save_to_ITOL = False,
		sep = ",",
		report_unfit = False):

		self.req = req
		self.order = order
		self.filter_id = filter_id
		self.group_id = group_id
		self.out_dir = os.path.abspath(out_dir)
		self.min_identity = min_identity
		self.min_blast_evalue = min_blast_evalue
		self.save_to_ITOL = save_to_ITOL
		self.sep = sep
		self.report_unfit = report_unfit

		if self.order is None:  ## user didn't override the order argument
			d = os.path.abspath(os.path.dirname(__file__))
			data_env = os.path.join(d, 'data/')
			self.order = utils.get_order(self.req,data_env)

		## get blast versions
		self.blastp = self._get_blast_version("blastp")
		self.makeblastdb = self._get_blast_version("makeblastdb")

		utils.assure_path_exists(os.path.join(self.out_dir,group_id+ "_GROUP"))
		utils.write_log(os.path.join(self.out_dir,group_id + "_GROUP", "LOG"), "STEP 4 : GROUP ORFs", vars(self), "")
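
Every example on this page creates its output directories through utils.assure_path_exists before writing anything. The sling utils module itself is not reproduced here, so the following is only a minimal sketch of what such a helper typically does (the project's real implementation may differ):

import os

def assure_path_exists(path):
    # Create `path` (including any missing parent directories) if it is not
    # already there; Python 2 has no exist_ok flag, hence the explicit check.
    if not os.path.isdir(path):
        os.makedirs(path)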
Code example #2
def main():
    gpu = '0'  # which GPU to use for experiment

    data_path = './cub200_resnet'  # path to data
    save_path = './results'  # path to save results
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    utils.assure_path_exists(save_path)

    dataset = 'cub200'
    capacity_list = utils.get_dataset_params(dataset)
    experiment_types = ['iid', 'class_iid']

    # loop over all experiment types
    for experiment_type in experiment_types:
        print('Experiment Type: ', experiment_type)

        # load and sort data
        X_train, y_train, X_test, y_test = utils.load_cub200(
            data_path, experiment_type)

        # loop over desired capacities
        for c in capacity_list:
            print('Capacity: ', c)
            acc, mpca = experiment(dataset, X_train, y_train, X_test, y_test,
                                   c)

            # save results out
            acc_name = 'acc_' + experiment_type + '_' + dataset + '_exstream_capacity_' + str(
                c)
            mpca_name = 'mpca_' + acc_name
            np.save(save_path + '/' + acc_name, np.asarray(acc))
            np.save(save_path + '/' + mpca_name, np.asarray(mpca))
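
The results are written with np.save, which appends a ".npy" extension to the given path. A purely hypothetical example of reading one of those arrays back (the capacity value and path below are invented for illustration):

import numpy as np

# e.g. accuracy curve for the 'iid' ordering on cub200 at a made-up capacity of 16
acc = np.load('./results/acc_iid_cub200_exstream_capacity_16.npy')
print(acc.shape, acc.mean())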
Code example #3
def write_matrix_to_file(args, group_dir, network_type, binary):
    ''' write the matrix to a file, both as a CSV
    and for reading into ITOL if asked'''
    with open(os.path.join(group_dir, network_type + ".csv"), "wb") as f:
        writer = csv.writer(f)
        writer.writerows(binary)

    if args.itol:
        ## get the maximum value of the matrix for ITOL
        max_val = [item for sublist in binary for item in sublist]
        max_val = [x for x in max_val if isinstance(x, int)]
        max_val = max(max_val)
        utils.assure_path_exists(os.path.join(group_dir, "ITOL"))
        with open(os.path.join(group_dir, "ITOL", network_type + ".txt"),
                  "wb") as f:
            f.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL," +
                    network_type + "\nCOLOR,#ff0000\nFIELD_LABELS," +
                    ",".join(binary[0][1:]) + "\nDATA\n")
            for i in range(1, len(binary)):
                f.write(",".join(map(str, binary[i])) + "\n")
        strains = [row[0] for row in binary][1:]
        for i in range(1, len(binary[0])):
            feature_vec = [row[i] for row in binary]
            ta = feature_vec[0]
            feature_vec = feature_vec[1:]
            generate_single_itol_output(
                group_dir, feature_vec, ta, strains, network_type,
                max_val)  ## create a single file for one system
    return
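
The CSV above is opened in binary mode ("wb") and written with plain strings, which is the Python 2 convention used throughout these snippets. On Python 3 the same matrix write would look roughly like this (a sketch only; write_matrix_csv_py3 is a hypothetical name, not part of sling):

import csv
import os

def write_matrix_csv_py3(group_dir, network_type, binary):
    # Python 3's csv.writer expects a text-mode file opened with newline="".
    with open(os.path.join(group_dir, network_type + ".csv"), "w", newline="") as f:
        csv.writer(f).writerows(binary)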
Code example #4
def run(args):
    args = copy.deepcopy(args)
    if "filter_id" not in vars(args):
        filter_id = args.id
    elif args.filter_id is None:
        filter_id = args.id
    else:
        filter_id = args.filter_id

    args.out_dir = os.path.abspath(args.out_dir)
    filter_dir = os.path.join(args.out_dir, filter_id + "_FILTER")
    group_dir = os.path.join(args.out_dir, args.id + "_GROUP")
    utils.assure_path_exists(group_dir)
    blast_dir = os.path.join(group_dir, "blast_files")

    if args.order is None:  ## user didn't override the order argument
        d = os.path.abspath(os.path.dirname(__file__))
        data_env = os.path.join(d, 'data/')
        args.order = utils.get_order(args.hmm_db, data_env)

    ## write LOG file
    get_blast_version(args.blastp, "blastp")
    get_blast_version(args.makeblastdb, "makeblastdb")
    utils.write_log(os.path.join(group_dir, "LOG"), "STEP 4 : GROUP ORFs",
                    vars(args), "")

    run_blast(args, group_dir, filter_dir, blast_dir)
    group_operons(args, group_dir, filter_dir, blast_dir)
    return
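
The first branch handles the three possible states of args.filter_id: the attribute may be missing entirely, present but None, or explicitly set. Assuming the same argparse-style Namespace, an equivalent and more compact way to express that fallback would be:

# Sketch only: fall back to args.id when filter_id is absent or None.
filter_id = getattr(args, "filter_id", None)
if filter_id is None:
    filter_id = args.id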
Code example #5
File: group_operons.py Project: ghtester123/sling
	def _write_matrix_to_file(self,network_type, binary):
		# when completed going over all components, save the matrix
		utils.assure_path_exists(os.path.join(self.group_dir ,network_type + "_clusters"))
		
		with open(os.path.join(self.group_dir, network_type +".csv"), "wb") as f:
			writer = csv.writer(f)
			writer.writerows(binary)
		
		if self.save_to_ITOL:
			
			## get the maximum value of the matrix for ITOL
			max_val = [item for sublist in binary for item in sublist]
			max_val = [x for x in max_val if isinstance(x, int)]
			max_val = max(max_val)

			utils.assure_path_exists(os.path.join(self.group_dir, "ITOL"))
			
			with open(os.path.join(self.group_dir,"ITOL",network_type + ".txt"),"wb") as f:
				f.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL,"+network_type+"\nCOLOR,#ff0000\nFIELD_LABELS," + ",".join(binary[0][1:]) + "\nDATA\n")
				for i in range(1,len(binary)):
					f.write(",".join(map(str,binary[i]))+ "\n")
			strains = [row[0] for row in binary][1:]
			for i in range(1,len(binary[0])):
				feature_vec = [row[i] for row in binary]
				ta = feature_vec[0]
				feature_vec = feature_vec[1:]
				self._annotate_system(feature_vec,ta,strains,network_type,max_val)
Code example #6
File: group_operons.py Project: ghtester123/sling
	def _write_toxin_output(self,label,curr_nodes, num_copies,domains,scores,toxins_lengths):
		# calc general properties
		utils.assure_path_exists(os.path.join(self.group_dir,"hits_clusters"))
		outfile = open(os.path.join(self.group_dir,"hits_clusters",label +".txt"),"w")
		outfile.write("##  ID: " + label + "\n")
		outfile.write("##  Num_Strains: " + str(num_copies) + "\n")
		outfile.write("##  Domains: " + ",".join(domains)+ "\n")	
		outfile.write("##  Average Hit Length: " +str(np.mean(toxins_lengths))+ "\n")
		outfile.write("##  Average HMMER score: " + str(np.mean(scores)) + "\n")

		outfile.write(self.sep.join(['Strain','Upstream_Cluster','Downstream_Cluster','Domain', 'HMMER_score', 'Contig', 'Strand',
		 'Hit_length', 'Hit_start', 'Hit_stop', 'Upstream_length', 'Upstream_delta', 'Upstream_start', 
			'Upstream_stop', 'Downstream_length', 'Downstream_delta', 'Downstream_start', 
			'Downstream_stop', 'Hit', 'Upstream', 'Downstream', 'source']) + "\n")
		for n in curr_nodes:
			hit_toks = n.split("*")
			hit_ID = hit_toks[0]
			curr_hit = self.hits[hit_ID]
			toks = curr_hit.toks
			strain = hit_ID.split("|")[0]
			outfile.write(self.sep.join(map(str,[strain,curr_hit.clusters["upstream"],curr_hit.clusters["downstream"],toks['Domain'].tolist()[0], toks['HMMER_score'].tolist()[0],
				toks['Contig'].tolist()[0], toks['Strand'].tolist()[0], toks['Hit_length'].tolist()[0], toks['Hit_start'].tolist()[0],toks['Hit_stop'].tolist()[0],
				toks['Upstream_length'].tolist()[0],toks['Upstream_delta'].tolist()[0],
				toks['Upstream_start'].tolist()[0],toks['Upstream_stop'].tolist()[0],toks['Downstream_length'].tolist()[0],
				toks['Downstream_delta'].tolist()[0],toks['Downstream_start'].tolist()[0],toks['Downstream_stop'].tolist()[0],
				toks['Hit'].tolist()[0],toks['Upstream'].tolist()[0],toks['Downstream'].tolist()[0],toks['Source'].tolist()[0]])) + "\n")
		outfile.close()
Code example #7
File: scan.py Project: ghoresh11/sling
def run(args):
    args = copy.deepcopy(args)
    # create output directory
    out_dir = os.path.abspath(args.out_dir)

    if "prep_id" not in vars(args):
        prep_id = args.id
    elif args.prep_id is None:
        prep_id = args.id
    else:
        prep_id = args.prep_id

    prep_dir = os.path.join(out_dir, prep_id + "_PREPARE")

    scan_dir = os.path.join(out_dir, args.id + "_SCAN")

    utils.assure_path_exists(scan_dir)  ## create the output directory

    if args.hmm_db not in utils.databases:  ## check if the database exists
        run_hmmpress(args)
    else:  # before starting, copy the data directory to the output directory
        copy_data(args, scan_dir)

    ## create list of jobs for HMMSEARCH
    jobs = create_jobs_list(args, prep_dir, scan_dir)
    ## run the pool
    pool = multiprocessing.Pool(args.cpu)
    results = pool.map_async(run_hmmer, tuple(jobs))
    pool.close()
    pool.join()
    return
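
pool.map_async returns an AsyncResult whose value is never inspected above, so an exception raised inside run_hmmer would be stored in the result object and silently discarded. A small variation (a sketch, not the project's actual code) that surfaces worker errors:

pool = multiprocessing.Pool(args.cpu)
try:
    # .get() blocks until every job has finished and re-raises any worker exception.
    pool.map_async(run_hmmer, tuple(jobs)).get()
finally:
    pool.close()
    pool.join()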
Code example #8
File: filter.py Project: ghtester123/sling
def report_unfit(args):
	if not args["report_unfit"]:
		return
	merge_unfit(args)
	utils.assure_path_exists(os.path.dirname(args["report_unfit"]))
	report = open(args["report_unfit"],"w")
	report.write(args["sep"].join(["Strain","Domain","HMMER_score","Contig","Strand","Hit_length","Hit_start","Hit_stop","Sequence","Reason1","Reason2"])+ "\n")

	for o in args["unfit_orfs"]:
		if args["unfit_orfs"][o].unfit:
			report.write(args["unfit_orfs"][o]._to_unfit(args["basename"],args["sep"]) + "\n")
	report.close()
Code example #9
def run_blast(args, group_dir, filter_dir, out_dir):
    ''' run blast on all hits and partners'''
    utils.assure_path_exists(out_dir)
    hits_to_fasta(args, out_dir, filter_dir)

    call_blast_command(args, out_dir, "hits")
    if args.order == "both":
        call_blast_command(args, out_dir, "upstream")
        call_blast_command(args, out_dir, "downstream")
    else:
        call_blast_command(args, out_dir, "partners")
    return
Code example #10
File: filter.py Project: ghoresh11/sling
def run(args):
    args = copy.deepcopy(args)
    # define input and output directories
    args.out_dir = os.path.abspath(args.out_dir)

    if "prep_id" not in vars(args):
        prep_id = args.id
    elif args.prep_id is None:
        prep_id = args.id
    else:
        prep_id = args.prep_id

    if "scan_id" not in vars(args):
        scan_id = args.id
    elif args.scan_id is None:
        scan_id = args.id
    else:
        scan_id = args.scan_id

    prep_dir = os.path.join(args.out_dir, prep_id + "_PREPARE")
    scan_dir = os.path.join(args.out_dir, scan_id + "_SCAN")
    results_dir = os.path.join(args.out_dir, args.id + "_FILTER")

    if args.report_unfit:
        args.report_unfit = os.path.join(results_dir, "UNFIT")
        utils.assure_path_exists(args.report_unfit)

    # create the results directory
    utils.assure_path_exists(results_dir)

    # get the SLING data environment
    d = os.path.abspath(os.path.dirname(__file__))
    data_env = os.path.join(d, 'data/')

    # get all FILTERING requirements
    args = vars(args)
    get_requirements(args, data_env)

    # get the profile lengths and profiles to ignore
    profile_lengths = parse_domains(args["hmm_db"], args["domains_file"],
                                    data_env)
    profiles_to_ignore = parse_domains_to_ignore(args["hmm_db"],
                                                 args["domains_to_ignore"],
                                                 data_env)

    # store the parsed profile information in the args dictionary
    args["profiles_to_ignore"] = profiles_to_ignore
    args["profile_lengths"] = profile_lengths

    jobs = create_job_list(results_dir, prep_dir, scan_dir, args)
    ### run all the jobs as a pool
    utils.run_pool(jobs, args, run_summarise)
    return
Code example #11
File: group_operons.py Project: ghtester123/sling
	def _annotate_system(self,feature_vec,ta,strains,network_type,max_val):

		## preset the colors to be loaded in ITOL
		max_colors = {"complete": "#005900","hits":"#3b1365","upstream":"#000099","partners":"#000099","downstream":"#cf4c0b","unfit":"#990000"}

		outdir = os.path.join(self.group_dir, "ITOL/", network_type + "_clusters")
		utils.assure_path_exists(outdir)
		out = open(os.path.join(outdir, ta  + ".txt"),"w")

		out.write("DATASET_HEATMAP\nSEPARATOR COMMA\nDATASET_LABEL," + ta + "\nCOLOR,#ff0000\nFIELD_LABELS," + ta +
			"\nCOLOR_MIN,#eeeded\nCOLOR_MAX," + max_colors[network_type] + "\nUSER_MIN_VALUE,0\nUSER_MAX_VALUE," +str(max_val) +"\nDATA\n")
		for i in range(0,len(strains)):
			out.write(strains[i]+ "," + str(feature_vec[i]) + "\n") 
		out.close()
Code example #12
File: scan.py Project: ghoresh11/sling
def copy_data(args, scan_dir):
    ''' if database exists in SLING, copy its content to the
    current working directory (prevents runtime errors)'''
    d = os.path.abspath(os.path.dirname(__file__))
    data_env = os.path.join(d, 'data/')
    curr_env = os.path.join(scan_dir, "data")
    utils.assure_path_exists(curr_env)

    for f in os.listdir(data_env):
        if args.hmm_db in f or "domains" in f:
            copyfile(os.path.join(data_env, f), os.path.join(curr_env, f))

    args.hmm_db = os.path.join(scan_dir, 'data', args.hmm_db)
    return
Code example #13
def create_complete_files(args, group_dir, strains):
    ''' aggregate the hit clusters and partners clusters
    to report on the full operons
    Generate all the output files for the complete clusters'''
    completes = {}
    utils.assure_path_exists(os.path.join(group_dir, "complete_clusters"))
    for filename in os.listdir(os.path.join(group_dir, "hits_clusters")):
        curr_files = {}  ## create a new file for each unique operon
        if not filename.endswith(".txt"):
            continue
        with open(os.path.join(group_dir, "hits_clusters", filename)) as f:
            toxin = filename.split(".")[0]  ## get the toxin ID
            for line in f:
                if line.startswith("#"):
                    continue
                toks = line.strip().split(args.sep)
                if line.startswith("Strain"):
                    header = line
                    domain_index = toks.index("Domain")
                    continue
                domain = toks[domain_index]
                write_line_to_complete(args, group_dir, toxin, toks[1],
                                       toks[2], curr_files, line, header,
                                       completes, domain)

        ## close the cluster files that were opened for this hits file
        for c_file in curr_files:
            curr_files[c_file].close()
    ## create a matrix of the results
    binary = [["Strain"] + map(str, range(1, 1 + len(completes)))]
    for strain in strains:
        binary.append([strain] + [0] * len(completes))
    curr_column = 1
    for complete in completes:
        binary[0][curr_column] = complete
        for strain in completes[complete]:
            row = strains.index(strain) + 1
            binary[row][curr_column] += 1
        curr_column += 1

    ## write the matrix to a file
    write_matrix_to_file(args, group_dir, "complete", binary)
    return
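
For orientation, the binary matrix assembled above has a header row naming each complete operon cluster and one row per strain holding copy counts. A hypothetical instance with two strains and two clusters (all names and values invented):

binary = [
    ["Strain", "cluster_1", "cluster_2"],
    ["strain_A", 2, 0],
    ["strain_B", 1, 1],
]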
Code example #14
File: filter.py Project: ghtester123/sling
def write_results(args):
	# create output directory if doesn't exist
	utils.assure_path_exists(os.path.dirname(args["out_file"]))

	## write files
	out = open(args["out_file"], "w")
	if args["req_dict"]["order"] == "upstream":
		out.write(args["sep"].join(["Domain","HMMER_score","Contig","Strand","Hit_length","Hit_start","Hit_stop","Upstream_length",
			"Upstream_delta","Upstream_start","Upstream_stop",
			"Hit","Upstream","Source"]) + "\n")
	elif args["req_dict"]["order"] == "downstream":
		out.write(args["sep"].join(["Domain","HMMER_score","Contig","Strand","Hit_length","Hit_start","Hit_stop",
			"Downstream_length","Downstream_delta","Downstream_start","Downstream_stop",
			"Hit","Downstream","Source"]) + "\n")
	else:
		out.write(args["sep"].join(["Domain","HMMER_score","Contig","Strand","Hit_length","Hit_start","Hit_stop","Upstream_length",
			"Upstream_delta","Upstream_start","Upstream_stop","Downstream_length","Downstream_delta","Downstream_start","Downstream_stop",
			"Hit","Upstream","Downstream","Source"]) + "\n")
	for o in args["orfs"]:
		if args["orfs"][o].keep:
			out.write( args["orfs"][o]._to_string(args["sep"],args["req_dict"]["order"]) + "\n")
	out.close()
Code example #15
File: group_operons.py Project: ghtester123/sling
	def _create_complete_files(self):
		completes = {}
		utils.assure_path_exists(os.path.join(self.group_dir ,"complete_clusters"))
		for file in os.listdir(os.path.join(self.group_dir,"hits_clusters")):
			curr_files = {}
			if not file.endswith(".txt"):
				continue
			with open(os.path.join(self.group_dir,"hits_clusters",file)) as f:
				toxin = file.split(".")[0]
				for line in f:
					if line.startswith("#"):
						continue
					if line.startswith("Strain"):
						header = line
						continue
					toks = line.strip().split(self.sep)
					domain = toks[3]
					if self.order == "upstream" or self.order == "downstream":
						self._write_line_to_complete(toxin,toks[1],curr_files,line,header,completes,domain)
					else:
						self._write_line_to_both(toxin,toks[1],toks[2],curr_files,line,header,completes,domain)

			for c_file in curr_files:
				curr_files[c_file].close()

		## create a matrix of the results
		binary= [["Strain"] + map(str,range(1,1+ len(completes)))]
		for strain in self.strains:
			binary.append([strain] + [0] * len(completes))
		curr_column = 1
		for complete in completes:
			binary[0][curr_column] = complete
			for strain in completes[complete]:
				row = self.strains.index(strain) + 1
				binary[row][curr_column] += 1
			curr_column += 1

		self._write_matrix_to_file("complete", binary)
Code example #16
File: group_operons.py Project: ghtester123/sling
	def _write_antitoxin_output(self,network_type,label,curr_nodes,num_copies,domains,scores,toxins_lengths,directions,antitoxins_lengths,deltas):
		utils.assure_path_exists(os.path.join(self.group_dir,network_type+"_clusters"))
		outfile = open(os.path.join(self.group_dir,network_type+"_clusters",label +".txt"),"w")
		outfile.write("##  ID: " + label + "\n")
		outfile.write("##  Num Copies: " + str(num_copies) + "\n")
		outfile.write("##  Domains: " + ",".join(domains)+ "\n")	
		outfile.write("##  Average Hit Length: " +str(np.mean(toxins_lengths))+ "\n")
		outfile.write("##  Average HMMER score: " + str(np.mean(scores)) + "\n")
		if network_type != "hits":
			outfile.write("##  Average Delta: " +str(np.mean(deltas))+ "\n")
			outfile.write("##  Average Partner Length: " +str(np.mean(antitoxins_lengths))+ "\n")
		if self.order == "either":
			outfile.write("##  Order: Upstream: " + str(directions["standard"]) + " Downstream: " + str(directions["reverse"]) + "\n")
		## add information on each of the hits separately
		outfile.write(self.sep.join(["Strain","Hit_Cluster","Domain","HMMER_score","Order","Contig","Strand","Partner_Length",
			"Delta","Partner_Start","Partner_Stop","Hit_Length","Hit_Start","Hit_Stop","Partner","Sequence","Source"])+"\n")
		for n in curr_nodes:
			hit_toks = n.split("*")
			hit_ID = hit_toks[0]
			curr_hit = self.hits[hit_ID]
			toks = curr_hit.toks
			strain = hit_ID.split("|")[0]
			cluster = curr_hit.clusters["hit"]
			if hit_toks[1] == "upstream" or self.order == "upstream":
				if self.order == "upstream":
					cluster =  curr_hit.clusters["upstream"]
				outfile.write(self.sep.join(map(str,[strain, cluster,toks['Domain'].tolist()[0],toks['HMMER_score'].tolist()[0],"upstream",
					toks['Contig'].tolist()[0],toks['Strand'].tolist()[0],toks['Upstream_length'].tolist()[0],toks['Upstream_delta'].tolist()[0],toks['Upstream_start'].tolist()[0],
					toks['Upstream_stop'].tolist()[0],toks['Hit_length'].tolist()[0],toks['Hit_start'].tolist()[0],toks['Hit_stop'].tolist()[0],toks['Upstream'].tolist()[0],
					toks['Hit'].tolist()[0],toks['Source'].tolist()[0]])) + "\n")
			else:
				if self.order == "downstream":
					cluster =  curr_hit.clusters["downstream"]
				outfile.write(self.sep.join(map(str,[strain, cluster,toks['Domain'].tolist()[0],toks['HMMER_score'].tolist()[0],"downstream",
					toks['Contig'].tolist()[0],toks['Strand'].tolist()[0],toks['Downstream_length'].tolist()[0],toks['Downstream_delta'].tolist()[0],toks['Downstream_start'].tolist()[0],
					toks['Downstream_stop'].tolist()[0],toks['Hit_length'].tolist()[0],toks['Hit_start'].tolist()[0],toks['Hit_stop'].tolist()[0],toks['Downstream'].tolist()[0],
					toks['Hit'].tolist()[0],toks['Source'].tolist()[0]])) + "\n")
		outfile.close()
Code example #17
File: group_operons.py Project: ghtester123/sling
	def _write_unfit_output(self, label, curr_nodes, num_copies, domains, scores, lengths, reasons):

		utils.assure_path_exists(os.path.join(self.group_dir,"unfit_clusters"))
		outfile = open(os.path.join(self.group_dir,"unfit_clusters" ,label +".txt"),"w")
		outfile.write("##  ID: " + label + "\n")
		outfile.write("##  Num Copies: " + str(num_copies) + "\n")
		outfile.write("##  Domains: " + ",".join(domains)+ "\n")	
		outfile.write("##  Average Hit Length: " +str(np.mean(lengths))+ "\n")
		outfile.write("##  Average HMMER score: " + str(np.mean(scores)) + "\n")
		outfile.write("## Reasons: " + ",".join(map(str,list(reasons))) + "\n")
		## add information on each of the hits separately
		outfile.write(self.sep.join(["Strain","Unfit_Cluster","Hit_Clusters","Domain","HMMER_score","Contig","Strand","Hit_Length","Hit_Start","Hit_Stop","Sequence","Reason1","Reason2"])+"\n")
		for n in curr_nodes:
			hit_toks = n.split("*")
			hit_ID = hit_toks[0]
			curr_hit = self.unfits[hit_ID]
			toks = curr_hit.toks
			strain = hit_ID.split("|")[0]
			unfit_cluster = curr_hit.clusters["unfit"]
			hit_clusters = curr_hit.clusters["hit"]
			outfile.write(self.sep.join(map(str,[strain, unfit_cluster, hit_clusters ,toks['Domain'].tolist()[0],toks['HMMER_score'].tolist()[0],
				toks['Contig'].tolist()[0],toks['Strand'].tolist()[0], toks['Hit_length'].tolist()[0],toks['Hit_start'].tolist()[0],toks['Hit_stop'].tolist()[0],
				toks['Sequence'].tolist()[0],toks['Reason1'].tolist()[0], toks['Reason2'].tolist()[0]] )) + "\n")
		outfile.close()
Code example #18
def write_single_output(args, network_type, keys, hits, group_dir, label,
                        curr_nodes, num_copies, domains, scores,
                        toxins_lengths, antitoxins_lengths, deltas, directions,
                        reasons):
    ''' create the output file for a toxin cluster
    Slightly messy because the outputs are different,
    depending on whether it's a toxin, an antitoxin or both.'''
    utils.assure_path_exists(
        os.path.join(group_dir, network_type + "_clusters"))
    outfile = open(
        os.path.join(group_dir, network_type + "_clusters", label + ".txt"),
        "w")

    ### Attributes of this cluster ####
    outfile.write("#  ID: " + label + "\n")
    outfile.write("#  Num copies: " + str(num_copies) + "\n")
    outfile.write("#  Domains: " + ",".join(domains) + "\n")
    outfile.write("#  Average Hit Length: " + str(np.mean(toxins_lengths)) +
                  "\n")
    outfile.write("#  Average HMMER score: " + str(np.mean(scores)) + "\n")
    if network_type in ["partners", "upstream", "downstream"]:
        outfile.write("##  Average Delta: " + str(np.mean(deltas)) + "\n")
        outfile.write("##  Average Partner Length: " +
                      str(np.mean(antitoxins_lengths)) + "\n")
    if args.order == "either" and network_type != "unfit":
        outfile.write("#  Order: Upstream: " + str(directions["standard"]) +
                      " Downstream: " + str(directions["reverse"]) + "\n")
    if network_type == "unfit":
        outfile.write("# Reasons: " + ",".join(map(str, list(reasons))) + "\n")

    ### HEADER ###
    if network_type == "hits":
        if args.order in ["either", "both"]:
            outfile.write(
                args.sep.join(
                    ['Strain', 'Upstream_Cluster', 'Downstream_Cluster'] +
                    keys) + "\n")
        else:  ## either upstream or downstream
            ## only upstream antitoxin
            outfile.write(
                args.sep.join(['Strain', 'Partner_Cluster'] + keys) + "\n")
    elif network_type == "unfit":
        outfile.write(args.sep.join(['Strain', 'Hit_Cluster'] + keys) + "\n")
    else:  # network_type in ["partners", "upstream", "downstream"]: ## antitoxins output
        ## only antitoxins
        outfile.write(
            args.sep.join([
                "Strain", "Hit_Cluster", "Domain", "HMMER_score", "Order",
                "Contig", "Strand", "Partner_Length", "Delta", "Partner_Start",
                "Partner_Stop", "Hit_Length", "Hit_Start", "Hit_Stop",
                "Partner", "Sequence", "Source"
            ]) + "\n")

    ### one line per member of this cluster ###
    for n in curr_nodes:
        hit_toks = n.split("*")
        hit_type = hit_toks[1]
        hit_ID = hit_toks[0]
        curr_hit = hits[hit_ID]
        strain = hit_ID.split("|")[0]
        if hit_type == "hit":  ## toxins
            if args.order in ["either", "both"]:
                outfile.write(
                    args.sep.join([
                        strain, curr_hit["clusters"]["upstream"],
                        curr_hit["clusters"]["downstream"]
                    ]))
            elif args.order == "upstream":
                outfile.write(
                    args.sep.join([strain, curr_hit["clusters"]["upstream"]]))
            else:
                outfile.write(
                    args.sep.join([strain,
                                   curr_hit["clusters"]["downstream"]]))
            for k in keys:
                outfile.write("," + str(curr_hit[k]))
            outfile.write("\n")
        elif hit_type == "unfit":
            hit_clusters = "+".join(list(curr_hit["clusters"]["hit"]))
            outfile.write(args.sep.join([strain, hit_clusters]))
            for k in keys:
                outfile.write("," + str(curr_hit[k]))
            outfile.write("\n")
        else:  ## antitoxin
            cluster = curr_hit["clusters"]["hit"]
            if hit_type == "upstream":
                outfile.write(
                    args.sep.join(
                        map(str, [
                            strain, cluster, curr_hit['Domain'],
                            curr_hit['HMMER_score'], "upstream",
                            curr_hit['Contig'], curr_hit['Strand'],
                            curr_hit['Upstream_length'],
                            curr_hit['Upstream_delta'],
                            curr_hit['Upstream_start'],
                            curr_hit['Upstream_stop'], curr_hit['Hit_length'],
                            curr_hit['Hit_start'], curr_hit['Hit_stop'],
                            curr_hit['Upstream'], curr_hit['Hit'],
                            curr_hit['Source']
                        ])) + "\n")
            else:  ## Downstream
                outfile.write(
                    args.sep.join(
                        map(str, [
                            strain, cluster, curr_hit['Domain'],
                            curr_hit['HMMER_score'], "downstream",
                            curr_hit['Contig'], curr_hit['Strand'],
                            curr_hit['Downstream_length'],
                            curr_hit['Downstream_delta'],
                            curr_hit['Downstream_start'],
                            curr_hit['Downstream_stop'],
                            curr_hit['Hit_length'], curr_hit['Hit_start'],
                            curr_hit['Hit_stop'], curr_hit['Downstream'],
                            curr_hit['Hit'], curr_hit['Source']
                        ])) + "\n")
    outfile.close()
    return
Code example #19
    def _hits_to_fasta(
        self
    ):  # combine all the hits and write them into a single fasta file to run blast
        utils.assure_path_exists(self.out_dir)
        hits = open(os.path.join(self.out_dir, "hits.fasta"), "w")

        if self.order == "both":
            downstream = open(os.path.join(self.out_dir, "downstream.fasta"),
                              "w")
            upstream = open(os.path.join(self.out_dir, "upstream.fasta"), "w")
        else:
            partners = open(os.path.join(self.out_dir, "partners.fasta"), "w")

        for file in os.listdir(self.results_dir):
            if not file.endswith(".csv"):
                continue
            strain = os.path.basename(file)
            strain = strain.replace(".csv", "")

            with open(os.path.join(self.results_dir, file)) as f:
                line_num = 1
                for line in f:
                    if line.startswith("Domain"):
                        continue
                    toks = line.strip().split(self.sep)
                    identifier = ">" + strain + "|" + str(line_num)

                    if self.order == "either" or self.order == "both":
                        hits.write(identifier + "*hit" + "\n" + toks[15] +
                                   "\n")
                        if self.order == "either":
                            if toks[16] != "":
                                partners.write(identifier + "*upstream" +
                                               "\n" + toks[16] + "\n")
                            if toks[17] != "":
                                partners.write(identifier + "*downstream" +
                                               "\n" + toks[17] + "\n")
                        else:
                            upstream.write(identifier + "*upstream" + "\n" +
                                           toks[16] + "\n")
                            downstream.write(identifier + "*downstream" +
                                             "\n" + toks[17] + "\n")
                    else:
                        hits.write(identifier + "*hit" + "\n" + toks[11] +
                                   "\n")
                        partners.write(identifier + "*" + self.order + "\n" +
                                       toks[12] + "\n")

                    line_num += 1

        ## if in the previous step, the unfit were also reported, add them to the "hits" in the network analysis
        if self.report_unfit and not os.path.exists(
                os.path.join(self.results_dir, "UNFIT")):
            warnings.warn(
                "Could not find UNFIT files from SUMMARISE step. To report unfit, turn on --report_unfit / -u flag in FILTER and run again."
            )

        if os.path.exists(os.path.join(self.results_dir, "UNFIT")):
            for file in os.listdir(os.path.join(self.results_dir, "UNFIT")):
                if not file.endswith(".csv"):
                    continue
                with open(os.path.join(self.results_dir, "UNFIT", file)) as f:
                    line_num = 1
                    for line in f:
                        if line.startswith("Strain"):
                            continue
                        toks = line.strip().split(",")
                        strain = toks[0]
                        identifier = ">" + strain + "|" + str(
                            line_num) + "*unfit"
                        hits.write(identifier + "\n" + toks[8] + "\n")
                        line_num += 1

        hits.close()

        if self.order == "both":
            upstream.close()
            downstream.close()
        else:
            partners.close()
Code example #20
File: scan.py Project: ghtester123/sling
	def run(self):
		
		## create output directory
		utils.assure_path_exists(self.args["scan_dir"])

		if self.args["hmm_db"] not in utils.databases:
			self._run_hmmpress()
		else:  ## before starting, copy the data directory to the output directory
			d = os.path.abspath(os.path.dirname(__file__))
			data_env = os.path.join(d, 'data/')
			os.system("mkdir -p " + os.path.join(self.args["scan_dir"],"data"))
			## Do both??? -> this seems to choose how to behave arbitrarily
			os.system("cp -r " + data_env + " " + os.path.join(self.args["scan_dir"]))
			os.system("cp -r " + data_env + " " + os.path.join(self.args["scan_dir"],"data"))
			self.args["hmm_db"] = os.path.join(self.args["scan_dir"],'data',self.args["hmm_db"])

		## get version of hmmscan for log file
		log_other =  self._get_hmmer_version("hmmscan")
		log_other = log_other + self._get_hmmer_version("hmmpress")
		log_other = log_other + "###   INPUT   ### \ncnt\tgenome\tfasta_file\tgff_file\n" # keeping a text file of all the genomes used

		jobs = []
		cnt = 1
		for file in os.listdir(self.args["prep_dir"]):
			if file.endswith(".sixframe.fasta"):
				basename = os.path.basename(file)
				basename = basename.replace(".sixframe.fasta","")

				sixframe_file = os.path.join(self.args["prep_dir"], basename + ".sixframe.fasta")
				annotated_file = os.path.join(self.args["prep_dir"], basename + ".annotated.fasta")

				scan_genome = {"basename" :basename, "source": "sixframe", "fasta_file": sixframe_file, "out_dir": self.args["scan_dir"], "hmm_db": self.args["hmm_db"], "hmmscan": self.args["configs"]["hmmscan"]}
				jobs.append(scan_genome)


				if os.path.isfile(annotated_file):
					scan_genome = {"basename" :basename, "source": "annotated", "fasta_file": annotated_file, "out_dir": self.args["scan_dir"], "hmm_db": self.args["hmm_db"], "hmmscan": self.args["configs"]["hmmscan"]}
					jobs.append(scan_genome)
					log_other = log_other + str(cnt) +"\t" + basename +"\t"+ sixframe_file +"\t"+ annotated_file+"\n"
				else:
					log_other = log_other + str(cnt) +"\t" + basename +"\t"+ sixframe_file +"\tnot found\n"
				cnt += 1


		utils.write_log(os.path.join(self.args["scan_dir"], "LOG"), "STEP 2 : GENOME SCANNING", self.args, log_other)

		## Allocate CPUs efficiently
		CPUs_per_scan = 5
		
		
		## if there are more CPUs than jobs, or fewer than 5 CPUs available, run one hmmscan at a time
		## and give it all of the available CPUs
		if self.args["cpu"] > len(jobs) or self.args["cpu"] < CPUs_per_scan:
			pool_size = 1
			CPUs_per_scan = self.args["cpu"]
		## otherwise, create pool of size CPUS/5, and each scan will run on 5 CPUs (what to do with CPUs I lose?)
		else:
			pool_size =  self.args["cpu"] / CPUs_per_scan 

		for j in jobs:
			j["scan_cpu"] = CPUs_per_scan
		


		pool = multiprocessing.Pool(processes = pool_size)
		results = pool.map_async(run_scan,tuple(jobs))
		pool.close()
		pool.join()
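
The CPU bookkeeping above splits the available CPUs between the pool and the individual hmmscan calls. The same arithmetic as a standalone sketch (note the integer division, which on Python 3 must be written with //):

def allocate_cpus(total_cpu, n_jobs, cpus_per_scan=5):
    # Fewer CPUs than one scan wants, or more CPUs than jobs: run one scan
    # at a time and hand it every available CPU.
    if total_cpu > n_jobs or total_cpu < cpus_per_scan:
        return 1, total_cpu
    # Otherwise run total_cpu // cpus_per_scan scans in parallel; leftover
    # CPUs stay idle, as the original comment notes.
    return total_cpu // cpus_per_scan, cpus_per_scan

print(allocate_cpus(16, 40))  # -> (3, 5): a pool of 3 scans, 5 CPUs each, 1 CPU idle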