Exemple #1
0
def get_unique_genes_for_organism(res_organism: str, res_genes: List[Gene],
                                  sus_organisms: List[str],
                                  drug_dirs: dir_utils.DrugDirs):
    """
    Narrow the genes of one resistant organism down to those without a perfect
    match in any susceptible organism, then write the survivors to a CSV.

    The candidate list is filtered once per susceptible organism: a gene is
    kept when blasting it against that organism yields no result, or yields a
    result that is not flagged as a perfect match. The remaining genes are
    written to "<res_organism>.csv" inside ``drug_dirs.unique_res_genes`` with
    the columns ``gene_name`` and ``gene_info``.
    """
    remaining_genes = res_genes
    print(f"Genes to check: {len(remaining_genes)}")

    for susceptible in sus_organisms:
        survivors = []
        for candidate in remaining_genes:
            hit = blast.blast(candidate, susceptible)
            # a perfect match in a susceptible organism disqualifies the gene
            if hit and hit.is_perfect_match:
                continue
            survivors.append(candidate)

        # the next susceptible organism only needs to see what is left
        print(f"Genes to check: {len(survivors)}")
        remaining_genes = survivors

    # write the final list of unique genes for this organism
    csv_path = os.path.join(drug_dirs.unique_res_genes, f"{res_organism}.csv")
    output_file = output_util.OutputFile(
        file_path=csv_path, header_list=["gene_name", "gene_info"])
    for unique_gene in remaining_genes:
        row = [unique_gene.name, unique_gene.description]
        output_file.write_data_list_to_output_file(row)
Exemple #2
0
def gather_potential_unique_combinations(
        drug_dirs: dir_utils.DrugDirs,
        write_output: bool = False) -> Dict[Gene, List[Gene]]:
    """
    Group the unique genes of every resistant organism by gene description.

    Genes are loaded per organism from the CSV files in
    ``drug_dirs.unique_res_genes`` (hypothetical genes are excluded via
    ``remove_hypothetical=True``). For each description at most one gene per
    organism is kept.

    :param drug_dirs: directory/file layout for the drug being analysed.
    :param write_output: when True, also write one row per description
        (description, organisms) to ``drug_dirs.potential_uniques``.
    :return: mapping of gene description to the list of genes carrying it.
        NOTE(review): the keys are ``gene.description`` values, although the
        return annotation declares ``Gene`` keys - confirm and adjust.
    """
    # gather all genes organized by organism
    res_unique_genes_by_org = gen_utils.get_organism_and_all_genes_from_folder_csv(
        drug_dirs.unique_res_genes, remove_hypothetical=True)

    final_gene_output = defaultdict(list)
    for organism, gene_list in res_unique_genes_by_org.items():
        print(f"Gathering unique genes from organism: {organism}")
        for unique_gene in gene_list:
            copies = final_gene_output[unique_gene.description]
            # Only add the gene when this organism has no copy stored yet.
            # The stored items are genes, so the organism name has to be
            # compared against each gene's organism rather than the gene.
            # NOTE(review): assumes gene.organism uses the same naming as the
            # keys of res_unique_genes_by_org - confirm.
            if all(existing.organism != organism for existing in copies):
                copies.append(unique_gene)

    # create output file for potential unique genes
    if write_output:
        output_file = output_util.OutputFile(
            file_path=drug_dirs.potential_uniques,
            header_list=["gene", "res_organisms"])
        for description, genes in final_gene_output.items():
            org_list = gene_utils.get_organisms_from_list_of_genes(genes)
            output_file.write_data_list_to_output_file([description, org_list])

    return final_gene_output
Exemple #3
0
def analyze_unique_gene_clusters(drug_dirs: dir_utils.DrugDirs):
    """
    Entry point for collecting gene clusters and dispatching uniqueness checks.

    Every cluster found under ``drug_dirs.cluster_dir`` is summarised in the
    cluster info file (cluster, description of its first record, member count,
    member organisms). One work item per cluster is then handed to a
    ``MultiProcessHandler`` that runs ``_get_unique_clusters``.
    """
    sus_organisms = gen_utils.get_organisms_by_phenotype(drug_dirs.sus_file)
    cluster_info: dict = _get_gene_clusters(drug_dirs.cluster_dir)

    # both reports share the same column layout
    report_columns = ("cluster", "gene", "count", "organisms")
    cluster_info_file = output_util.OutputFile(
        file_path=drug_dirs.cluster_info, header_list=list(report_columns))
    unique_clusters_file = output_util.OutputFile(
        file_path=drug_dirs.unique_clusters, header_list=list(report_columns))

    process_data = []
    for cluster, info in cluster_info.items():
        print(f"Processing {cluster=}")

        representative: SeqRecord = info[0]
        # the organism is the second "~"-separated field of the description
        member_organisms = [
            record.description.split("~")[1] for record in info
        ]
        member_genes = [
            gene.Gene(organism=member_organism, gene_name=record.name)
            for member_organism, record in zip(member_organisms, info)
        ]

        # record the cluster summary, then queue the uniqueness check for it
        summary_row = [
            cluster, representative.description,
            len(info), member_organisms
        ]
        cluster_info_file.write_data_list_to_output_file(summary_row)
        work_item = (cluster, unique_clusters_file, member_genes,
                     sus_organisms)
        process_data.append(work_item)

    process_handler = gen_utils.MultiProcessHandler(
        max_processes=MAX_PROCESSES,
        target=_get_unique_clusters,
        input_list=process_data)
    process_handler.start()
def get_reciprocal_genes_for_organism(organism, res_genes, res_organisms, sus_organisms, drug_dirs):
    """
    Find reciprocal homolog hits for the genes of one resistant organism and
    write them to "<organism>.csv" in ``drug_dirs.reciprocal_res_genes``.

    For every gene that ``gene.check_if_unique`` accepts against the
    susceptible organisms, the gene is blasted against each other resistant
    organism. A hit is recorded only when it is a homolog and blasting the hit
    back against ``organism`` yields a homolog whose gene name equals the
    original target gene name. A result row is emitted for every gene, even
    when no hit was recorded for it.

    :param organism: resistant organism that owns ``res_genes``.
    :param res_genes: genes of ``organism`` to examine.
    :param res_organisms: all resistant organisms (``organism`` is skipped).
    :param sus_organisms: susceptible organisms used for the uniqueness check.
    :param drug_dirs: directory layout; ``reciprocal_res_genes`` is the output dir.
    :return: None. Nothing is written when ``res_genes`` is empty.
    """
    all_results = []
    gene_list_length = len(res_genes)
    for count, res_gene in enumerate(res_genes, start=1):
        combined_result = blast.CombinedResult(res_gene.name)
        print(f"Organism: {organism} | Count: {count} / {gene_list_length}")

        if gene.check_if_unique(res_gene, sus_organisms):
            for res_organism in res_organisms:
                # never compare the organism against itself
                if res_organism == organism:
                    continue

                blast_data = blast.blast(res_gene, res_organism)
                if not blast_data or not blast_data.is_homolog:
                    continue

                # blast the hit back against the source organism
                reciprocal_blast = blast.blast(blast_data.blast_gene, organism)
                if not reciprocal_blast:
                    continue

                # keep the hit only if the way back lands on the original gene
                if (reciprocal_blast.is_homolog
                        and reciprocal_blast.blast_gene.name == blast_data.target_gene.name):
                    combined_result.add_new_result(blast_data)

        all_results.append(combined_result)

    # The header is taken from the first result, so there is nothing that can
    # be written (and nothing to index) when no genes were supplied.
    if not all_results:
        print(f"No genes to process for organism: {organism}; skipping output.")
        return

    # output final reciprocal results for organism
    output_file = output_util.OutputFile(file_path=os.path.join(drug_dirs.reciprocal_res_genes, f"{organism}.csv"),
                                         header_list=all_results[0].header())
    for result in all_results:
        output_file.write_data_list_to_output_file(result.data())
Exemple #5
0
def check_unique_clusters_for_genes_of_interest(
        drug_dirs: dir_utils.DrugDirs,
        organism_file_path: str,
        genes_to_collect: Optional[List[str]] = None,
        genes_to_filter: Optional[List[str]] = None):
    """
    Check which genes of a new organism are unique relative to the
    susceptible organisms and write their descriptions to
    ``drug_dirs.investigated_unique_genes``.

    Steps:
      1. Load the unique clusters CSV and select the rows whose ``gene``
         column passes the filter/collect keyword lists.
      2. Read the genes from the fasta file at ``organism_file_path`` and
         write each one to its own fasta file in ``drug_dirs.new_organism_dir``
         (the directory is regenerated with ``overwrite_dir=True``).
      3. Keep the genes for which ``gene.check_if_unique`` is truthy against
         the susceptible organisms and output their descriptions.

    NOTE(review): the filtered cluster rows from step 1 are not used yet; the
    "belongs in cluster" check is still open.

    :param drug_dirs: directory/file layout for the drug being analysed.
    :param organism_file_path: fasta file holding the new organism's genes.
    :param genes_to_collect: keywords a cluster gene must match to be kept.
    :param genes_to_filter: keywords that exclude a cluster gene.
    """
    # normalise missing keyword lists, as investigate_potential_unique_combinations does
    if genes_to_collect is None:
        genes_to_collect = []
    if genes_to_filter is None:
        genes_to_filter = []

    sus_organisms = gen_utils.get_organisms_by_phenotype(drug_dirs.sus_file)

    # get all unique clusters
    cluster_data = pd.read_csv(drug_dirs.unique_clusters, header=0)

    # filter clusters for genes of interest, removing any that should be filtered
    filtered_clusters = []
    for _, row in cluster_data.iterrows():
        gene_name = row["gene"]
        if gen_utils.check_if_gene_in_keyword_list(gene_name, genes_to_filter):
            continue
        if gen_utils.check_if_gene_in_keyword_list(gene_name, genes_to_collect):
            filtered_clusters.append(row)

    # get all genes of new organism
    organism_gene_list = gen_utils.get_list_of_genes_from_fasta_file(
        organism_file_path)

    # create fasta file and dir for each file (needed for blast)
    gene_object_list = []
    dir_utils.generate_dir(drug_dirs.new_organism_dir, overwrite_dir=True)
    for organism_gene in organism_gene_list:
        # strip characters that are not allowed in file names; dots become
        # underscores so the id can serve as the fasta file stem
        organism_gene.id = re.sub(r'[\\/*?:"<>|]', "",
                                  organism_gene.id).replace(".", "_")
        fasta_path = os.path.join(drug_dirs.new_organism_dir,
                                  f"{organism_gene.id}.fasta")
        with open(fasta_path, "w") as output_handle:
            SeqIO.write(organism_gene, output_handle, "fasta")

        gene_object = gene.Gene("new_organism",
                                organism_gene.id,
                                new_organism=True)
        gene_object.description = organism_gene.description
        gene_object_list.append(gene_object)

    # check if gene is unique to resistant group
    unique_to_resistant: List[gene.Gene] = []
    for organism_gene in gene_object_list:
        print(organism_gene.description)
        if gene.check_if_unique(organism_gene, sus_organisms):
            unique_to_resistant.append(organism_gene)

    # check if gene belongs in cluster?

    # output genes that are unique/belong to cluster
    output_file = output_util.OutputFile(
        file_path=drug_dirs.investigated_unique_genes,
        header_list=["gene_name"])
    for result in unique_to_resistant:
        # the writer takes a list of column values (one column here); passing
        # the bare string would hand it a sequence of single characters
        output_file.write_data_list_to_output_file([result.description])
Exemple #6
0
def investigate_potential_unique_combinations(
        drug_dirs: dir_utils.DrugDirs,
        genes_to_collect: Optional[List[str]] = None,
        genes_to_filter: Optional[List[str]] = None,
        write_output: bool = False):
    """
    Compare each filtered, potentially unique gene against all resistant
    organisms and collect the blast results.

    For every gene in the filtered set, the resistant organisms are split
    into those that carry a unique copy of the gene and those that do not.
    The gene is blasted against every organism of both groups (skipping its
    own organism) and the hits are accumulated in one
    ``blast.UniqueGeneCompareResult`` per gene.

    :param drug_dirs: directory/file layout for the drug being analysed.
    :param genes_to_collect: keywords of genes to keep (None means no keywords).
    :param genes_to_filter: keywords of genes to drop (None means no keywords).
    :param write_output: when True, write the results to
        ``drug_dirs.investigated_unique_genes`` (skipped if there are none).
    :return: list of ``blast.UniqueGeneCompareResult``, one per examined gene.
    """
    # determine if any of the genes need to be removed/filtered
    if genes_to_collect is None:
        genes_to_collect = []
    if genes_to_filter is None:
        genes_to_filter = []

    filtered_unique_combinations: dict = gather_filtered_potential_unique_genes(
        drug_dirs, genes_to_collect, genes_to_filter, write_output)
    # coerce to a set so the set difference below works regardless of the
    # container type the helper returns
    res_organisms = set(
        gen_utils.get_organisms_by_phenotype(drug_dirs.res_file))

    final_gene_output = []
    # go through each gene in the filtered set of genes
    for gene_key, gene_list in filtered_unique_combinations.items():
        # get the unique organisms for the set of genes
        unique_organisms = set(
            gene_utils.get_organisms_from_list_of_genes(gene_list))
        # compare against other resistant organisms that do not contain unique copies of the gene of interest
        not_unique_organisms = res_organisms - unique_organisms

        print(f"Checking gene: {gene_key}")
        for potential_gene in gene_list:
            combined_result = blast.UniqueGeneCompareResult(
                potential_gene,
                unique_group=unique_organisms,
                not_unique_group=not_unique_organisms)

            # blast against all other organisms holding a unique copy
            for unique_organism in unique_organisms:
                # we don't want to compare the gene to itself.
                if unique_organism == potential_gene.organism:
                    continue

                unique_blast = blast.blast(potential_gene, unique_organism)
                if unique_blast:
                    combined_result.add_new_result(unique_blast)

            # blast against the resistant organisms without a unique copy
            for not_unique_organism in not_unique_organisms:
                not_unique_blast = blast.blast(potential_gene,
                                               not_unique_organism)
                if not_unique_blast:
                    combined_result.add_new_sus_result(not_unique_blast)

            final_gene_output.append(combined_result)
            print(f"Finished checking gene: {gene_key}")

    if write_output:
        if final_gene_output:
            # output final unique genes for organism; the header comes from
            # the first result, so at least one result is required
            output_file = output_util.OutputFile(
                file_path=drug_dirs.investigated_unique_genes,
                header_list=final_gene_output[0].header())
            for result in final_gene_output:
                output_file.write_data_list_to_output_file(result.data())
        else:
            print("No unique gene results to write; skipping output.")

    return final_gene_output