def load_GO_annotations_mapper(args):
    # Load Gene Ontology -> Pfam annotation table
    logger.verbose(
        "Loading Pfam to GO Biological Process annotation mappings")
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])
    GO_to_pfam_mapping = file_utilities.load_json(args["go_to_pfam_filename"])
    logger.verbose(
        "Loading of Pfam to GO Biological Process annotation mappings done")

    return pfam_families, GO_to_pfam_mapping
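
# Hedged usage sketch (the paths are hypothetical placeholders, not the
# project's real configuration): the two returned mappings are plain dicts
# loaded from JSON, so they can be queried directly, e.g. GO term -> Pfam
# accessions.
def example_go_mapping_usage():
    example_args = {
        "pfam_families_filename": "pfam_families.json",  # hypothetical path
        "go_to_pfam_filename": "go_bp_to_pfam.json",     # hypothetical path
    }
    pfam_families, GO_to_pfam_mapping = load_GO_annotations_mapper(example_args)
    for go_term, pfam_accessions in GO_to_pfam_mapping.items():
        print(go_term, pfam_accessions)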
def import_gene_coord_data(organism_name, args):
    coordinates_file, gene_to_chr_mapping_file = \
        file_utilities.get_coord_file_from_id(organism_name,
                                              args["parsed_coordinates_filename_suffix"],
                                              args["gene_locations_filename_suffix"],
                                              args["out_dir"])
    gene_coordinates = import_parsed_coordinates(coordinates_file)
    gene_to_chr_mapping = file_utilities.load_json(gene_to_chr_mapping_file)

    return gene_coordinates, gene_to_chr_mapping
def parse_coordinates(args, strand=False):
    logger.info("Started parsing assembly gene coordinates files")

    # Create a dictionary for the sizes of each genome.
    # If one is already present, import it and append any new items.
    if os.path.exists(
            args["genome_sizes_filename"]) and args['force'] is False:
        try:
            genome_sizes_dictionary = \
                manager.dict(file_utilities.load_json(args["genome_sizes_filename"]))
        except EOFError:
            genome_sizes_dictionary = manager.dict()
    else:
        genome_sizes_dictionary = manager.dict()

    if len(genome_sizes_dictionary) >= len(args['gbk_files_list']):
        # Everything has already been parsed
        del args["gbk_files_list"]
        return genome_sizes_dictionary
    else:
        # Collect the parsed coordinates files generated in previous runs
        args["parsed_coordinates_file_list"] = \
            file_utilities.get_file_list(args['parsed_coordinates_out_dir'],
                                         args["parsed_coordinates_filename_suffix"],
                                         verbose=True, error=False)

        # If there are files to process, start parsing them:
        if len(args["gbk_files_list"]) > 0:
            parsed_coordinates_file_list = manager.list()

            multi = MyMultiProcess(threads=args["threads"],
                                   target=create_parsed_coordinate_dictionary,
                                   input=args["gbk_files_list"],
                                   args=[
                                       genome_sizes_dictionary,
                                       parsed_coordinates_file_list, strand,
                                       args
                                   ])
            multi.run()

            args["parsed_coordinates_file_list"] = list(
                parsed_coordinates_file_list)

            logger.info(
                "Parsed gene coordinates files for %s assemblies generated and saved in folder %s",
                len(args["gbk_files_list"]),
                args['parsed_coordinates_out_dir'])

        logger.info("Individual DNA molecule sizes saved in %s",
                    args['genome_sizes_filename'])

    return genome_sizes_dictionary
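
# MyMultiProcess and the module-level `manager` are project-internal; below is
# a minimal sketch consistent with how they are called in these examples (an
# illustration, not the project's actual implementation):
from multiprocessing import Manager, Pool

manager = Manager()  # provides shared dict/list proxies, picklable by workers

class MyMultiProcess:
    def __init__(self, threads, target, input, args=None):
        self.threads = threads
        self.target = target
        self.input = input          # one task per item
        self.args = args or []      # extra arguments shared by all tasks

    def run(self):
        # Fan each input item out to a worker, appending the shared args
        with Pool(processes=self.threads) as pool:
            pool.starmap(self.target,
                         [(item, *self.args) for item in self.input])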
def load_all_organisms_coordinates(args):
    logger.info("Loading domain coordinates for all organisms")
    bar = Bar('Loading organism data',
              max=len(args['pfam_annotations_file_list']))
    all_gene_coordinates = {}
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])

    for pfam_annotations_file in args['pfam_annotations_file_list']:
        organism_name = file_utilities.get_organism_from_file(
            pfam_annotations_file, args["pfam_annotations_filename_suffix"])
        genome_id, genome_size = args["genome_sizes"][organism_name][0]
        gene_coordinates, gene_to_chr_mapping = import_gene_coord_data(
            organism_name, args)
        gene_hits = import_pfam_hits(pfam_annotations_file,
                                     gene_to_chr_mapping,
                                     args['evalue'],
                                     pfam_families,
                                     genome_id,
                                     domain_centric=False)

        coords = np.zeros([len(gene_hits), 2], dtype=int)
        probs = np.zeros(len(gene_hits), dtype=float)
        # Weight each gene by its share of the total number of domain hits
        total_prob = sum(len(hits) for hits in gene_hits.values())
        for index, gene in enumerate(gene_hits.keys()):
            coords[index] = np.array(
                [gene_coordinates[gene][0], gene_coordinates[gene][1]])
            probs[index] = len(gene_hits[gene]) / total_prob

        all_gene_coordinates[organism_name] = \
            [compute_uniform_probability(float(genome_size) / 2),
             float(genome_size),
             coords,
             probs]
        bar.next()

    bar.finish()
    logger.info("Domain coordinates for all organisms loaded")
    return all_gene_coordinates
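
# compute_uniform_probability is project-internal; a minimal sketch, assuming
# it returns the density of a uniform null model over pairwise distances,
# which on a circular chromosome of size L lie in [0, L/2] (hence the
# float(genome_size) / 2 argument used above):
def compute_uniform_probability(max_distance):
    # Uniform density over distances in [0, max_distance]
    return 1.0 / max_distance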
def compute_organisms_probabilities(args):
    """This function is an aggregator to compute the exponential and uniform likelihood
    of all single genomes, using multiprocess.
    """
    pfam_families = file_utilities.load_json(args["pfam_families_filename"])

    pfam_annotations_file_list = args["pfam_annotations_file_list"]
    n_organisms = len(pfam_annotations_file_list)

    if not os.path.exists(args["uniform_probabilities_filename"]) or \
            args["force"] is True:
        uniform_probabilities = manager.dict()
    else:
        uniform_probabilities = manager.dict(
            probabilities_files.load_uniform_probabilities(
                args["uniform_probabilities_filename"]))

    number_of_pairs = indexing_domain_pairs.index_all_pairs(
        pfam_annotations_file_list, pfam_families, args)

    logger.info("Started computing organism-specific clustering "
                "probabilities of Pfam domains")

    multi = MyMultiProcess(threads=args["threads"],
                           target=organism_domain_clustering_probabilities,
                           input=pfam_annotations_file_list,
                           args=[uniform_probabilities, pfam_families, args])
    multi.run()

    logger.info(
        "Computation of organism-specific clustering probabilities of "
        "Pfam domains for %s organisms done", n_organisms)
    logger.info(
        "All individual organism probabilities were saved in directory %s",
        args["individual_probabilities_dir"])

    return uniform_probabilities, number_of_pairs
def load_uniform_probabilities(uniform_probability_filename):
    return file_utilities.load_json(uniform_probability_filename)
def import_parsed_coordinates(coord_file):
    """Import the ordered dictionary with the parsed genomic coordinates of each gene."""
    return file_utilities.load_json(coord_file)
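
# The file_utilities JSON helpers used throughout these examples are
# project-internal; a plausible minimal implementation (the real ones may add
# compression, ordering, or error handling):
import json

def load_json(filename):
    with open(filename) as handle:
        return json.load(handle)

def save_json(data, filename):
    with open(filename, "w") as handle:
        json.dump(data, handle)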
def organism_domain_clustering_probabilities(pfam_annotations_file,
                                             uniform_probabilities,
                                             pfam_families, args):
    """This function computes the exp and unif likelihood
    for all pairs from a single genome."""

    organism_name = file_utilities.get_organism_from_file(
        pfam_annotations_file, args["pfam_annotations_filename_suffix"])

    # Output file
    probabilities_filename = \
        os.path.join(args["individual_probabilities_dir"],
                     organism_name + args["probabilities_filename_suffix"]
                     + ".json")

    if not os.path.exists(probabilities_filename) or args['force'] is True:
        individual_indexes_filename = os.path.join(
            args["pairs_indexes_dir"],
            args["pairs_indexes_individual_template"].format(organism_name))

        all_domain_pair_indexes = file_utilities.load_json(
            individual_indexes_filename)

        # Copy step is necessary because the dictionary is shared among
        # processes, and repeated lookups on the shared proxy slow the
        # script down when run in parallel:
        genome_sizes_org = [args["genome_sizes"][organism_name][0]]
        genome_numeric_id = 0
        organism_probabilities = {}
        gene_coordinates = None

        for genome_molecule in genome_sizes_org:
            genome_key = \
                file_utilities.get_organism_id_from_name(organism_name,
                                                         genome_numeric_id)

            logger.verbose(
                "Started computing individual clustering "
                "probabilities from file %s", pfam_annotations_file)

            genome_size, genome_id = genome_molecule[1], genome_molecule[0]

            if gene_coordinates is None:
                # Load the gene coordinates for this organism only once
                gene_coordinates, gene_to_chr_mapping = \
                    load_genome_annotations.import_gene_coord_data(
                        organism_name, args)

            # Import Pfam annotations for this chromosome
            domain_hits = \
                load_genome_annotations.import_pfam_hits(pfam_annotations_file,
                                                         gene_to_chr_mapping,
                                                         args["evalue"],
                                                         pfam_families,
                                                         genome_id)

            if len(domain_hits) > 0:
                # Saves the exponential probabilities for this chromosome
                organism_probabilities[genome_key] = \
                    compute_probabilities(domain_hits,
                                          all_domain_pair_indexes,
                                          gene_coordinates,
                                          genome_id,
                                          genome_size,
                                          pfam_annotations_file,
                                          args)

                # Computes the uniform probability for this chromosome
                uniform_probabilities[genome_key] = \
                    compute_uniform_probability(float(genome_size) / 2)

            genome_numeric_id += 1

        file_utilities.save_json(organism_probabilities,
                                 probabilities_filename)
        probabilities_files.save_uniform_probabilities(
            uniform_probabilities, args["uniform_probabilities_filename"])
        logger.verbose(
            "Individual clustering probabilities from file %s, "
            "DNA molecule %s computed", pfam_annotations_file, genome_id)
        del all_domain_pair_indexes

    else:
        logger.verbose(
            "Individual clustering probabilities "
            "from file %s already present", pfam_annotations_file)
def get_gene_pairs_distances(coordinates_file,
                             gene_to_chr_mapping_file,
                             pfam_annotations_file,
                             GO_to_pfam_mapping,
                             pfam_families,
                             genome_id,
                             genome_size,
                             args,
                             return_annotations=False):
    # Name of the output file to save pairwise distances of GO pairs (optional)
    if args["save_dist"] is True:
        GO_pairs_distances_file_name = \
            file_utilities \
                .new_suffix_file(pfam_annotations_file,
                                 args["pfam_annotations_filename_suffix"],
                                 "_GO" + \
                                 args["pfam_annotations_filename_suffix"],
                                 args["single_distances_dir"],
                                 file_extension="")
        if not os.path.exists(GO_pairs_distances_file_name):
            repeat_file = True
        else:
            repeat_file = False
    else:
        GO_pairs_distances_file_name = None
        repeat_file = True

    # If this file hasn't already been generated in a previous run:
    if repeat_file is True or args["force"] is True:
        logger.verbose("Started computing chromosomal distances of "
                       "GO-related Pfam domain pairs from file %s",
                       pfam_annotations_file)

        # Load the Pfam annotations for the genes in this genome,
        # and organise them according to their GO biological process annotations
        gene_coordinates = load_genome_annotations.import_parsed_coordinates(
            coordinates_file)

        domain_hits = load_genome_annotations.import_pfam_hits(
            pfam_annotations_file,
            file_utilities.load_json(
                gene_to_chr_mapping_file),
            args["evalue"],
            pfam_families,
            genome_id)

        # Get list of all gene pairs in the genome with shared GO annotations
        if return_annotations is False:
            GO_pairs = get_GO_pairs_set(domain_hits, GO_to_pfam_mapping)
        else:
            GO_pairs, GO_pairs_annotations = \
                get_GO_pairs_set(domain_hits,
                                 GO_to_pfam_mapping,
                                 return_annotations)

        # Compute pairwise distances for gene pairs with shared GO annotations
        GO_pairs_distances = \
            genomic_pairwise_distances.pairwise_distances(GO_pairs,
                                                          gene_coordinates,
                                                          genome_size)

        # If we want to look at non-GO pairs
        if args["non_go"] is True:
            all_pfam_genes = set(list(itertools.chain(*domain_hits.values())))
            all_pairs = itertools.product(all_pfam_genes,
                                          all_pfam_genes)
            other_pairs = list(set(all_pairs) - set(GO_pairs))
            other_pairs_distances = \
                genomic_pairwise_distances.pairwise_distances(other_pairs,
                                                              gene_coordinates,
                                                              genome_size)
        if len(GO_pairs_distances) == 0:
            logger.verbose("No GO-related domain pairs for file %s available",
                           pfam_annotations_file)
        else:
            # Save results
            if args["save_dist"] is True:
                save_pairwise_distances_file(GO_pairs_distances_file_name,
                                             GO_pairs_distances)

                if args["non_go"] is True:
                    args['non_go_pairs_distances_file_name'] = \
                        GO_pairs_distances_file_name.replace("_GO", "_not_GO")
                    save_pairwise_distances_file(
                        args['non_go_pairs_distances_file_name'],
                        other_pairs_distances)

        logger.verbose("Chromosomal distances of GO-related Pfam domain "
                       "pairs for file %s computed", pfam_annotations_file)

    else:
        # Reload previously computed distances instead of recomputing them
        GO_pairs_distances = np.load(GO_pairs_distances_file_name)
        logger.verbose("Chromosomal distances of GO-related Pfam domain pairs "
                       "for file %s already computed",
                       pfam_annotations_file)

    if args['non_go'] is True:
        if return_annotations is True:
            return GO_pairs_distances, other_pairs_distances, \
                   GO_pairs_annotations
        else:
            return GO_pairs_distances, other_pairs_distances
    else:
        if return_annotations is True:
            return GO_pairs_distances, GO_pairs_annotations
        else:
            return (GO_pairs_distances,)
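
# genomic_pairwise_distances.pairwise_distances is project-internal. A hedged
# sketch of the per-pair distance it plausibly computes, assuming circular
# chromosomes and (start, end) coordinate tuples as loaded above:
def circular_gene_distance(coord_a, coord_b, genome_size):
    midpoint_a = (coord_a[0] + coord_a[1]) / 2.0
    midpoint_b = (coord_b[0] + coord_b[1]) / 2.0
    linear = abs(midpoint_a - midpoint_b)
    # On a circular chromosome the distance is the shorter of the two arcs
    return min(linear, genome_size - linear)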
def compute_conserved_probabilities(probabilities_files_list,
                                    organisms_uniform_probabilities, n_pairs,
                                    tree, subclade_id,
                                    subclade_final_probabilities_filename,
                                    args):
    """ This function computes the finals conserved clustered probabilities of all
    organisms whose probabilities are listed in probabilities files list.
    """
    if subclade_id == 0:
        subclade_id = "root"

    if args["verbose"] is False:
        logger.setLevel(logging.INFO)

    logger.verbose(
        "Computing final conserved clustering probabilities from subclade %s",
        subclade_id)

    # Initialises empty arrays for the results
    domain_pairs_probabilities = {
        'exp_prob': np.zeros(n_pairs),
        'uni_prob': np.zeros(n_pairs)
    }

    domain_pairs_data = {
        'occurrences': np.zeros(n_pairs, dtype=np.uint32),
        'n_genes1': np.zeros(n_pairs, dtype=np.uint32),
        'n_genes2': np.zeros(n_pairs, dtype=np.uint32),
        'fusions': np.zeros(n_pairs, dtype=np.uint16),
        'n_organisms': np.zeros(n_pairs, dtype=np.uint16)
    }

    if args["weight"] is True:
        organism_weights = GSC.GSC_normalised(tree)
        GSC.save_weights(organism_weights, args["weights_file"], subclade_id,
                         "a")

    for probabilities_filename in probabilities_files_list:
        logger.verbose(
            "Adding individual clustering probabilities from file %s",
            probabilities_filename)

        organism_name = file_utilities.get_organism_from_file(
            probabilities_filename, args["probabilities_filename_suffix"])

        organism_has_pair = np.zeros(n_pairs, dtype=np.uint8)

        if args["weight"] is True:
            organism_weight = organism_weights[organism_name]
        else:
            organism_weight = 1

        organism_probabilities = file_utilities.load_json(
            probabilities_filename)
        for genome_key in organism_probabilities.keys():
            add_organism_probabilities(
                organism_probabilities[genome_key],
                organisms_uniform_probabilities[genome_key], organism_weight,
                domain_pairs_probabilities, domain_pairs_data,
                organism_has_pair)

        domain_pairs_data["n_organisms"] = \
            domain_pairs_data["n_organisms"] + organism_has_pair
        del organism_has_pair

        logger.verbose(
            "Individual clustering probabilities from file %s added",
            probabilities_filename)

    domain_pairs_data["prob"] = compute_final_probabilities(
        domain_pairs_probabilities, args["phi"])
    del domain_pairs_probabilities

    logger.verbose(
        "Saving final conserved clustering probabilities from subclade %s",
        subclade_id)

    prob_files_utilities.save_conserved_probabilities(
        domain_pairs_data, subclade_final_probabilities_filename)

    logger.verbose(
        "Final conserved clustering probabilities from "
        "subclade %s saved in file %s", subclade_id,
        subclade_final_probabilities_filename)
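
# compute_final_probabilities is project-internal. A hypothetical sketch,
# assuming the final score is the posterior weight of the exponential
# (clustered) component against the uniform one, mixed by phi; the project's
# actual formula may differ:
import numpy as np

def compute_final_probabilities(domain_pairs_probabilities, phi):
    exp_prob = domain_pairs_probabilities["exp_prob"]
    uni_prob = domain_pairs_probabilities["uni_prob"]
    with np.errstate(divide="ignore", invalid="ignore"):
        prob = (phi * exp_prob) / (phi * exp_prob + (1.0 - phi) * uni_prob)
    # Pairs never observed yield 0/0; report them as probability 0
    return np.nan_to_num(prob)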
def main(args):
    config_file = "global_config.conf"
    args = init_scripts.create_arguments_dict(args, config_file)
    if args["verbose"] is True:
        logger.setLevel(logging.VERBOSE)
    else:
        logger.setLevel(logging.INFO)

    # Check if the input directory exists
    data_dir = os.path.abspath(args["data_dir"])
    if not os.path.exists(data_dir):
        logger.error("Input directory %s does not exist", data_dir)
        raise ValueError("Input directory {} does not exist".format(data_dir))
    else:
        args["data_dir"] = data_dir
        logger.info("Input directory: %s", data_dir)

    init_scripts.check_input_files(args)

    # Check if there is something to run (options)
    options = ["fit_model", "predict", "permutations", "bootstrap",
               "sensitivity"]
    parameters = [el for el in options if args[el] is True]

    if len(parameters) == 0:
        logger.error("Select at least one of these options: --%s",
                     ", --".join(options))
        raise ValueError("At least one of the --{} options has to be selected"
                         .format(", --".join(options)))
    else:
        logger.info("Clustering analysis will be run in the following mode(s): %s",
                    ", ".join(parameters))

    logger.info("Number of threads: %s", args["threads"])

    # Creates output directories
    init_scripts.create_output_directories(args)

    if args["weight"] is True or args["subclades"] is True:
        if not os.path.exists(args["tree_file"]):
            logger.error("Input phylogenetic tree file %s not found",
                         args["tree_file"])
            raise ValueError("Input phylogenetic tree file {} not found"
                             .format(args["tree_file"]))
        else:
            logger.info("Input phylogenetic tree file: %s", args["tree_file"])

    if args["weight"] is True:
        logger.info("Conserved clustering probabilities will be "
                    "weighted using phylogenetic distances")
    if args["subclades"] is True:
        logger.info("Conserved clustering probabilities will be "
                    "computed for each subclade")
    else:
        logger.info("Conserved clustering probabilities will be "
                    "computed just for the root node")

    # Check if the parsed coordinates files already exist; if not, create them from the gbk files
    if args["fit_model"] is True or args["predict"] is True:
        args["genome_sizes"] = gbk_to_fast_coordinates.parse_coordinates(args)
    else:
        args["genome_sizes"] = file_utilities.load_json(
            args['genome_sizes_filename'])

    if args["fit_model"] is True:
        args["lambd"], args["phi"] = fit_clustering_model.fit_clustering_model(args)
    else:
        if os.path.exists(args["general_parameters_filename"]):
            args["lambd"], args["phi"] = import_export_parameters\
                .import_parameters(args["general_parameters_filename"])

    if args["bootstrap"] is True:
        bootstrap.bootstrap_parameters(args)

    if args["sensitivity"] is True:
        bootstrap.sensitivity_analysis(args)

    if args["predict"] is True:
        if args["fit_model"] is False:
                logger.info("Imported global mean clustering estimated "
                            "parameter values are lambda={:.3g}, phi={:.3g}"
                    .format(args["lambd"], args["phi"]))
        else:
            logger.info("Using default global mean clustering estimated "
                            "parameter values lambda={:.3g}, phi={:.3g}"
                .format(args["lambd"], args["phi"]))

        predict.predict_clustering_pairs(args)

    if args["permutations"] is True:
        if os.path.exists(args['subclades_dir']):
            permute.predict_permuted_clustering_pairs(args)
        else:
            logger.error("Permutations are accepted only for subclade-specific "
                         "clustering analysis.")
def import_genome_sizes(genome_sizes_file):
    return file_utilities.load_json(genome_sizes_file)
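
# Hypothetical command-line entry point; the option names are inferred from
# the keys read from args above and are assumptions, not the project's real
# CLI definition.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Domain clustering analysis")
    parser.add_argument("--data_dir", required=True)
    parser.add_argument("--threads", type=int, default=1)
    parser.add_argument("--fit_model", action="store_true")
    parser.add_argument("--predict", action="store_true")
    parser.add_argument("--permutations", action="store_true")
    parser.add_argument("--bootstrap", action="store_true")
    parser.add_argument("--sensitivity", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    main(vars(parser.parse_args()))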