def get_unique_genes_for_organism(res_organism: str, res_genes: List[Gene],
                                  sus_organisms: List[str],
                                  drug_dirs: dir_utils.DrugDirs):
    """
    Narrow one resistant organism's genes down to those with no perfect
    match in any susceptible organism, then write the survivors to a CSV.

    The gene list is filtered once per susceptible organism, so each pass
    only re-checks the genes that survived every previous organism.

    Args:
        res_organism: Name of the resistant organism; used as the CSV name.
        res_genes: Genes of the resistant organism to check.
        sus_organisms: Susceptible organisms to compare against.
        drug_dirs: Directory layout; ``unique_res_genes`` receives the CSV.
    """
    candidates = res_genes
    print(f"Genes to check: {len(candidates)}")

    for sus_organism in sus_organisms:
        # A gene survives when the blast call returns nothing usable or
        # returns a hit that is not flagged as a perfect match.
        candidates = [
            candidate for candidate in candidates
            if not ((hit := blast.blast(candidate, sus_organism))
                    and hit.is_perfect_match)
        ]
        print(f"Genes to check: {len(candidates)}")

    # Write whatever survived every susceptible organism.
    csv_path = os.path.join(drug_dirs.unique_res_genes, f"{res_organism}.csv")
    output_file = output_util.OutputFile(
        file_path=csv_path, header_list=["gene_name", "gene_info"])
    for survivor in candidates:
        output_file.write_data_list_to_output_file(
            [survivor.name, survivor.description])
def gather_potential_unique_combinations(
        drug_dirs: dir_utils.DrugDirs,
        write_output: bool = False) -> Dict[str, List[Gene]]:
    """
    Group the unique genes of every resistant organism by gene description.

    Each description maps to the genes carrying it, with at most one gene
    per organism (the first one encountered for that organism).

    Args:
        drug_dirs: Directory layout; genes are read from ``unique_res_genes``
            and, when requested, the summary is written to
            ``potential_uniques``.
        write_output: When true, also write one row per description holding
            the description and the organisms of its genes.

    Returns:
        A ``defaultdict(list)`` mapping gene description to the list of
        matching genes.
    """
    # gather all genes organized by organism
    res_unique_genes_by_org = gen_utils.get_organism_and_all_genes_from_folder_csv(
        drug_dirs.unique_res_genes, remove_hypothetical=True)

    final_gene_output = defaultdict(list)
    # Organisms already recorded for each description. The stored values are
    # Gene objects, so testing an organism name for membership in that list
    # could not detect an existing copy; track the names separately.
    organisms_seen = defaultdict(set)

    for organism, gene_list in res_unique_genes_by_org.items():
        print(f"Gathering unique genes from organism: {organism}")
        for gene in gene_list:
            # skip the gene if this organism already has a copy in the output
            if organism in organisms_seen[gene.description]:
                continue
            organisms_seen[gene.description].add(organism)
            final_gene_output[gene.description].append(gene)

    # create output file for potential unique genes
    if write_output:
        output_file = output_util.OutputFile(
            file_path=drug_dirs.potential_uniques,
            header_list=["gene", "res_organisms"])
        for description, genes in final_gene_output.items():
            org_list = gene_utils.get_organisms_from_list_of_genes(genes)
            output_file.write_data_list_to_output_file([description, org_list])

    return final_gene_output
def analyze_unique_gene_clusters(drug_dirs: dir_utils.DrugDirs):
    """
    Entry point for collecting gene clusters and checking them for uniqueness.

    Every cluster is summarised in the cluster-info file, and one work item
    per cluster is queued for ``_get_unique_clusters``, which is run through
    a ``MultiProcessHandler``.

    Args:
        drug_dirs: Directory layout supplying the susceptible-organism file,
            the cluster directory and both output file paths.
    """
    report_columns = ("cluster", "gene", "count", "organisms")

    sus_organisms = gen_utils.get_organisms_by_phenotype(drug_dirs.sus_file)
    cluster_info: dict = _get_gene_clusters(drug_dirs.cluster_dir)

    # Both reports share the same column layout; each gets its own list.
    cluster_info_file = output_util.OutputFile(
        file_path=drug_dirs.cluster_info, header_list=list(report_columns))
    unique_clusters_file = output_util.OutputFile(
        file_path=drug_dirs.unique_clusters, header_list=list(report_columns))

    process_data = []
    for cluster, info in cluster_info.items():
        print(f"Processing {cluster=}")
        representative = info[0]

        # The organism is the second "~"-separated field of the description.
        organisms = [record.description.split("~")[1] for record in info]
        genes = [
            gene.Gene(organism=record_organism, gene_name=record.name)
            for record_organism, record in zip(organisms, info)
        ]

        # Summarise the cluster now; the uniqueness check is deferred to the
        # worker processes started below.
        summary_row = [
            cluster, representative.description,
            len(info), organisms
        ]
        cluster_info_file.write_data_list_to_output_file(summary_row)
        process_data.append(
            (cluster, unique_clusters_file, genes, sus_organisms))

    process_handler = gen_utils.MultiProcessHandler(
        max_processes=MAX_PROCESSES,
        target=_get_unique_clusters,
        input_list=process_data)
    process_handler.start()
def get_reciprocal_genes_for_organism(organism, res_genes, res_organisms,
                                      sus_organisms, drug_dirs):
    """
    Find, for each unique gene of ``organism``, reciprocal homologs in the
    other resistant organisms and write the combined results to a CSV.

    A gene is only considered when ``gene.check_if_unique`` accepts it against
    the susceptible organisms. For each other resistant organism the gene is
    blasted forward; if the hit is a homolog, the hit's ``blast_gene`` is
    blasted back against ``organism``. The forward hit is recorded when the
    reverse hit is also a homolog and its ``blast_gene`` name equals the
    forward hit's ``target_gene`` name.

    Args:
        organism: Resistant organism whose genes are being checked.
        res_genes: Genes of ``organism``.
        res_organisms: All resistant organisms (``organism`` itself is skipped).
        sus_organisms: Susceptible organisms used for the uniqueness check.
        drug_dirs: Directory layout; ``reciprocal_res_genes`` receives the CSV.

    Returns:
        None. When no gene passes the uniqueness check, no file is written.
    """
    all_results = []
    gene_list_length = len(res_genes)

    for count, res_gene in enumerate(res_genes, start=1):
        combined_result = blast.CombinedResult(res_gene.name)
        print(f"Organism: {organism} | Count: {count} / {gene_list_length}")

        # genes that are not unique to the resistant group are skipped
        if not gene.check_if_unique(res_gene, sus_organisms):
            continue

        for res_organism in res_organisms:
            # we don't want to compare the organism to itself
            if res_organism == organism:
                continue

            blast_data = blast.blast(res_gene, res_organism)
            if not blast_data or not blast_data.is_homolog:
                continue

            # blast the forward hit back against the original organism
            reciprocal_blast = blast.blast(blast_data.blast_gene, organism)
            if not reciprocal_blast or not reciprocal_blast.is_homolog:
                continue

            if reciprocal_blast.blast_gene.name == blast_data.target_gene.name:
                combined_result.add_new_result(blast_data)

        all_results.append(combined_result)

    # The header comes from the first result, so there is nothing to write
    # (and nothing to index) when no gene passed the uniqueness check.
    if not all_results:
        print(f"Organism: {organism} | No unique genes found, skipping output")
        return

    # output final unique genes for organism
    output_file = output_util.OutputFile(
        file_path=os.path.join(drug_dirs.reciprocal_res_genes,
                               f"{organism}.csv"),
        header_list=all_results[0].header())
    for result in all_results:
        output_file.write_data_list_to_output_file(result.data())
def check_unique_clusters_for_genes_of_interest(
        drug_dirs: dir_utils.DrugDirs,
        organism_file_path: str,
        genes_to_collect: Optional[List[str]] = None,
        genes_to_filter: Optional[List[str]] = None):
    """
    Check which genes of a new organism are unique to the resistant group.

    Every gene in the organism's fasta file is written to its own fasta file
    in ``drug_dirs.new_organism_dir`` (that directory is regenerated), wrapped
    in a ``gene.Gene`` and tested with ``gene.check_if_unique`` against the
    susceptible organisms. Descriptions of the unique genes are written to
    ``drug_dirs.investigated_unique_genes``.

    Args:
        drug_dirs: Directory layout supplying input and output locations.
        organism_file_path: Fasta file holding the new organism's genes.
        genes_to_collect: Keywords a cluster's gene must match to be kept.
        genes_to_filter: Keywords that exclude a cluster's gene.
    """
    sus_organisms = gen_utils.get_organisms_by_phenotype(drug_dirs.sus_file)

    # get all unique clusters
    cluster_data = pd.read_csv(drug_dirs.unique_clusters, header=0)

    # filter clusters for genes of interest, removing any that should be filtered
    # NOTE(review): filtered_clusters is built but not used further down; the
    # "does the gene belong to a cluster" step appears to be unimplemented.
    filtered_clusters = []
    for _, row in cluster_data.iterrows():
        gene_name = row["gene"]
        if gen_utils.check_if_gene_in_keyword_list(gene_name, genes_to_filter):
            continue
        if gen_utils.check_if_gene_in_keyword_list(gene_name, genes_to_collect):
            filtered_clusters.append(row)

    # get all genes of new organism
    organism_gene_list = gen_utils.get_list_of_genes_from_fasta_file(
        organism_file_path)

    # create fasta file and dir for each file (needed for blast)
    gene_object_list = []
    dir_utils.generate_dir(drug_dirs.new_organism_dir, overwrite_dir=True)
    for organism_gene in organism_gene_list:
        # The id becomes a file name: strip \ / * ? : " < > | and turn dots
        # into underscores.
        organism_gene.id = re.sub(r'[\\/*?:"<>|]', "",
                                  organism_gene.id).replace(".", "_")
        fasta_path = os.path.join(drug_dirs.new_organism_dir,
                                  f"{organism_gene.id}.fasta")
        with open(fasta_path, "w") as output_handle:
            SeqIO.write(organism_gene, output_handle, "fasta")

        gene_object = gene.Gene("new_organism", organism_gene.id,
                                new_organism=True)
        gene_object.description = organism_gene.description
        gene_object_list.append(gene_object)

    # check if gene is unique to resistant group
    unique_to_resistant: List[gene.Gene] = []
    for organism_gene in gene_object_list:
        print(organism_gene.description)
        if gene.check_if_unique(organism_gene, sus_organisms):
            unique_to_resistant.append(organism_gene)
            # check if gene belongs in cluster?

    # output genes that are unique/belong to cluster
    output_file = output_util.OutputFile(
        file_path=drug_dirs.investigated_unique_genes,
        header_list=["gene_name"])
    for result in unique_to_resistant:
        # The writer takes a list of column values (as at every other call
        # site), so the single column is wrapped in a list instead of passing
        # the bare description string.
        output_file.write_data_list_to_output_file([result.description])
def investigate_potential_unique_combinations(
        drug_dirs: dir_utils.DrugDirs,
        genes_to_collect: Optional[List[str]] = None,
        genes_to_filter: Optional[List[str]] = None,
        write_output: bool = False):
    """
    Blast every filtered, potentially unique gene against the resistant
    organisms and collect one comparison result per gene.

    For each gene description, the organisms owning a copy form the "unique"
    group and the remaining resistant organisms form the "not unique" group.
    Each gene is blasted against every other organism of the unique group and
    against every organism of the not-unique group.

    Args:
        drug_dirs: Directory layout supplying input and output locations.
        genes_to_collect: Keywords of genes to keep; ``None`` means ``[]``.
        genes_to_filter: Keywords of genes to drop; ``None`` means ``[]``.
        write_output: When true, write the results to
            ``drug_dirs.investigated_unique_genes`` (also forwarded to
            ``gather_filtered_potential_unique_genes``).

    Returns:
        List of ``blast.UniqueGeneCompareResult``, one per investigated gene.
    """
    # determine if any of the genes need to be removed/filtered
    if genes_to_collect is None:
        genes_to_collect = []
    if genes_to_filter is None:
        genes_to_filter = []

    filtered_unique_combinations: dict = gather_filtered_potential_unique_genes(
        drug_dirs, genes_to_collect, genes_to_filter, write_output)

    # Coerce to a set so the difference below works whether the helper
    # returns a list or a set of organism names.
    res_organisms = set(
        gen_utils.get_organisms_by_phenotype(drug_dirs.res_file))

    final_gene_output = []
    # go through each gene in the filtered set of genes
    for gene_description, gene_list in filtered_unique_combinations.items():
        # get the unique organisms for the set of genes
        unique_organisms = set(
            gene_utils.get_organisms_from_list_of_genes(gene_list))
        # compare against other resistant organisms that do not contain
        # unique copies of the gene of interest
        not_unique_organisms = res_organisms - unique_organisms

        print(f"Checking gene: {gene_description}")
        for potential_gene in gene_list:
            combined_result = blast.UniqueGeneCompareResult(
                potential_gene,
                unique_group=unique_organisms,
                not_unique_group=not_unique_organisms)

            # check all other unique organisms
            for unique_organism in unique_organisms:
                # we don't want to compare the gene to itself.
                if unique_organism == potential_gene.organism:
                    continue
                unique_blast = blast.blast(potential_gene, unique_organism)
                if unique_blast:
                    combined_result.add_new_result(unique_blast)

            # check all non unique organisms
            for not_unique_organism in not_unique_organisms:
                not_unique_blast = blast.blast(potential_gene,
                                               not_unique_organism)
                if not_unique_blast:
                    combined_result.add_new_sus_result(not_unique_blast)

            final_gene_output.append(combined_result)
        print(f"Finished checking gene: {gene_description}")

    if write_output:
        if final_gene_output:
            # output final unique genes for organism
            output_file = output_util.OutputFile(
                file_path=drug_dirs.investigated_unique_genes,
                header_list=final_gene_output[0].header())
            for result in final_gene_output:
                output_file.write_data_list_to_output_file(result.data())
        else:
            # The header is taken from the first result; with no results
            # there is nothing to write.
            print("No unique genes to output")

    return final_gene_output