Code example #1
def predictHotspots(pangenome, output, force=False, cpu=1, spot_graph=False, overlapping_match=2, set_size=3,
                    exact_match=1, disable_bar=False):
    # check that given parameters for hotspot computation make sense
    checkParameterLogic(overlapping_match, set_size, exact_match)
    # check for formerly computed stuff, and erase if allowed
    checkPangenomeFormerSpots(pangenome, force)
    # check statuses and load info
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True,
                       needRGP=True, disable_bar=disable_bar)

    # get multigenic gene families
    logging.getLogger().info("Detecting multigenic families...")
    multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])

    logging.getLogger().info("Detecting hotspots in the pangenome...")

    # predict spots
    spots = makeSpotGraph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, exact_match)

    if len(spots) == 0:
        logging.getLogger().warning("No spots were detected.")
    else:
        logging.getLogger().info(f"{len(spots)} spots were detected")

    pangenome.addSpots(spots)
    pangenome.status["spots"] = "Computed"
    pangenome.parameters["spots"] = {}
    pangenome.parameters["spots"]["set_size"] = set_size
    pangenome.parameters["spots"]["overlapping_match"] = overlapping_match
    pangenome.parameters["spots"]["exact_match"] = exact_match
Code example #2
def predictRGP(pangenome,
               force=False,
               persistent_penalty=3,
               variable_gain=1,
               min_length=3000,
               min_score=4,
               dup_margin=0.05,
               cpu=1):

    # check statuses and load info
    checkPangenomeFormerRGP(pangenome, force)
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=False,
                       needPartitions=True)

    logging.getLogger().info("Detecting multigenic families...")
    multigenics = pangenome.get_multigenics(dup_margin)
    logging.getLogger().info("Compute Regions of Genomic Plasticity ...")
    bar = tqdm(pangenome.organisms, unit="genomes")
    for org in bar:
        pangenome.addRegions(
            compute_org_rgp(org, persistent_penalty, variable_gain, min_length,
                            min_score, multigenics))
    logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP")

    # save parameters and status
    pangenome.parameters["RGP"] = {}
    pangenome.parameters["RGP"]["persistent_penalty"] = persistent_penalty
    pangenome.parameters["RGP"]["variable_gain"] = variable_gain
    pangenome.parameters["RGP"]["min_length"] = min_length
    pangenome.parameters["RGP"]["min_score"] = min_score
    pangenome.parameters["RGP"]["dup_margin"] = dup_margin
    pangenome.status['predictedRGP'] = "Computed"
Code example #3
File: writeSequences.py Project: labgem/PPanGGOLiN
def writeSequenceFiles(pangenome, output, fasta=None, anno=None, soft_core=0.95, regions=None, genes=None,
                       gene_families=None, prot_families=None, compress=False, disable_bar=False):
    if not any(x for x in [regions, genes, prot_families, gene_families]):
        raise Exception("You did not indicate what file you wanted to write.")

    needAnnotations = False
    needFamilies = False
    needGraph = False
    needPartitions = False
    needSpots = False
    needRegions = False
    needModules = False

    if any(x is not None for x in [regions, genes, gene_families, prot_families]):
        needAnnotations = True
        needFamilies = True
    if regions is not None or any(x == "rgp" for x in (genes, gene_families, prot_families)):
        needRegions = True
    if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
        needPartitions = True
    for x in (genes, gene_families, prot_families):
        if x is not None and 'module_' in x:
            needModules = True

    if not (needAnnotations or needFamilies or needGraph or needPartitions or needSpots or needRegions or needModules):
        # nothing recognizable was requested, so the provided filter must be invalid.
        # find which filter was provided, to report it
        provided_filter = ''
        if genes is not None:
            provided_filter = genes
        if gene_families is not None:
            provided_filter = gene_families
        if prot_families is not None:
            provided_filter = prot_families
        if regions is not None:
            provided_filter = regions
        raise Exception(
            f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. {poss_values_log}")
    ex_geneSequences = Exception("The provided pangenome has no gene sequences. "
                                 "This is not compatible with any of the following options : --genes, --gene_families")
    ex_geneFamilySequences = Exception("The provided pangenome has no gene families. "
                                       "This is not compatible with any of the following options : "
                                       "--prot_families, --gene_families")
    if not pangenome.status["geneSequences"] in ["inFile"] and (genes or gene_families):
        raise ex_geneSequences
    if not pangenome.status["geneFamilySequences"] in ["Loaded", "Computed", "inFile"] and prot_families:
        raise ex_geneFamilySequences

    checkPangenomeInfo(pangenome, needAnnotations=needAnnotations, needFamilies=needFamilies, needGraph=needGraph,
                       needPartitions=needPartitions, needRGP=needRegions, needSpots=needSpots, needModules=needModules,
                       disable_bar=disable_bar)

    if prot_families is not None:
        writeFastaProtFam(pangenome, output, compress, prot_families, soft_core=soft_core, disable_bar=disable_bar)
    if gene_families is not None:
        writeFastaGeneFam(pangenome, output, compress, gene_families, soft_core=soft_core, disable_bar=disable_bar)
    if genes is not None:
        writeGeneSequences(pangenome, output, compress, genes, soft_core=soft_core, disable_bar=disable_bar)
    if regions is not None:
        writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, disable_bar=disable_bar)
Code example #4
File: cluster.py Project: vinisalazar/PPanGGOLiN
def readClustering(pangenome,
                   families_tsv_file,
                   infer_singletons=False,
                   force=False):
    """
        Creates the pangenome, the gene families and the genes with an associated gene family.
        Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome, needAnnotations=True)

    logging.getLogger().info("Reading " + families_tsv_file +
                             " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_tsv_file = read_compressed_or_not(families_tsv_file)
    frag = False
    #the genome annotations are necessarily loaded.
    nbGeneWtFam = 0
    bar = tqdm(total=filesize, unit="bytes")
    for line in families_tsv_file:
        bar.update(len(line))
        elements = [el.strip()
                    for el in line.split()]  # 2 or 3 fields expected
        if len(elements) <= 1:
            logging.getLogger().error(
                "No tabulation separator found in gene families file")
            exit(1)
        (fam_id, gene_id,
         is_frag) = elements if len(elements) == 3 else elements + [None]

        geneObj = pangenome.getGene(gene_id)
        if geneObj is not None:
            nbGeneWtFam += 1
            fam = pangenome.addGeneFamily(fam_id)
            geneObj.is_fragment = (is_frag == "F")
            fam.addGene(geneObj)
        if is_frag == "F":
            frag = True
    bar.close()
    families_tsv_file.close()
    if nbGeneWtFam < len(pangenome.genes):  # not all genes have an associated cluster
        if nbGeneWtFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step. "
                "Please ensure that the annotations that you loaded previously and the clustering "
                "results that you are using share the same gene IDs."
            )
        else:
            if infer_singletons:
                inferSingletons(pangenome)
            else:
                raise Exception(
                    "Some genes did not have an associated cluster. Either change your cluster file so that each gene has a cluster, or use the --infer_singletons option to infer a cluster for each non-clustered gene."
                )
    pangenome.status["genesClustered"] = "Computed"
    if frag:  # the file contained fragment information
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
Code example #5
File: writeFlat.py Project: sletort/PPanGGOLiN
def writeFlatFiles(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, csv=False, genePA=False, gexf=False,
                   light_gexf=False, projection=False, stats=False, json=False, partitions=False, regions=False,
                   families_tsv=False, spots=False, borders=False, compress=False):

    if not any(x for x in [csv, genePA, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders, families_tsv]):
        raise Exception("You did not indicate what file you wanted to write.")

    global pan
    pan = pangenome
    processes = []
    needAnnotations = False
    needFamilies = False
    needGraph = False
    needPartitions = False
    needSpots = False
    needRegions = False

    if csv or genePA or gexf or light_gexf or projection or stats or json or partitions or regions or spots or families_tsv or borders:
        needAnnotations = True
        needFamilies = True
    if projection or stats or partitions or regions or spots or borders:
        needPartitions = True
    if gexf or light_gexf or json:
        needGraph = True
    if regions or spots or borders:
        needRegions = True
    if spots or borders:
        needSpots = True

    checkPangenomeInfo(pan, needAnnotations=needAnnotations, needFamilies=needFamilies, needGraph=needGraph,
                       needPartitions=needPartitions, needRGP=needRegions, needSpots=needSpots)
    pan.getIndex()  # build the index now, as it will most likely be used
    with Pool(processes=cpu) as p:
        if csv:
            processes.append(p.apply_async(func=writeMatrix, args=(',', "csv", output, compress, True)))
        if genePA:
            processes.append(p.apply_async(func=writeGenePresenceAbsence, args=(output, compress)))
        if gexf:
            processes.append(p.apply_async(func=writeGEXF, args=(output, False, soft_core, compress)))
        if light_gexf:
            processes.append(p.apply_async(func=writeGEXF, args=(output, True, soft_core, compress)))
        if projection:
            processes.append(p.apply_async(func=writeProjections, args=(output, compress)))
        if stats:
            processes.append(p.apply_async(func=writeStats, args=(output, soft_core, dup_margin, compress)))
        if json:
            processes.append(p.apply_async(func=writeJSON, args=(output, compress)))
        if partitions:
            processes.append(p.apply_async(func=writeParts, args=(output, soft_core, compress)))
        if families_tsv:
            processes.append(p.apply_async(func=writeGeneFamiliesTSV, args=(output, compress)))
        if regions:
            processes.append(p.apply_async(func=writeRegions, args=(output, compress)))
        if spots:
            processes.append(p.apply_async(func=writeSpots, args=(output, compress)))
        if borders:
            processes.append(p.apply_async(func=writeBorders, args=(output, dup_margin, compress)))

        for process in processes:
            process.get()  # collect results and surface any worker exception
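
A note on the design: the writer functions dispatched with apply_async receive only small arguments (paths and flags) and read the pangenome through the module-level global pan, which avoids pickling the large Pangenome object once per task. A stripped-down sketch of the idiom with a made-up worker (it relies on a fork-based multiprocessing start method, the Linux default, so that children inherit the global):

from multiprocessing import Pool

pan = None  # module-level global, read by the workers after fork

def write_summary(output):  # illustrative worker, not a PPanGGOLiN function
    print(f"{len(pan.organisms)} organisms summarized in {output}")

def write_all(pangenome, output, cpu=1):
    global pan
    pan = pangenome
    with Pool(processes=cpu) as p:
        tasks = [p.apply_async(func=write_summary, args=(output,))]
        for task in tasks:
            task.get()  # re-raises any exception from the worker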
Code example #6
def align(pangenome,
          proteinFile,
          output,
          tmpdir,
          identity=0.8,
          coverage=0.8,
          defrag=False,
          cpu=1,
          getinfo=False,
          draw_related=False):
    if pangenome.status["geneFamilySequences"] not in [
            "inFile", "Loaded", "Computed"
    ]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )
    if getinfo:
        checkPangenomeInfo(pangenome,
                           needAnnotations=True,
                           needFamilies=True,
                           needRGP=True,
                           needPartitions=True,
                           needSpots=True)
    else:
        checkPangenomeInfo(pangenome, needFamilies=True)

    newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    tmpPangFile = tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name)

    writeGeneFamSequences(pangenome, tmpPangFile)

    with read_compressed_or_not(proteinFile) as protFileObj:
        protSet = getProt(protFileObj)
        alignFile = alignSeqToPang(tmpPangFile, protFileObj, output, newtmpdir,
                                   cpu, defrag, identity, coverage)

    prot2pang = readAlignments(alignFile, pangenome)

    if getinfo:
        getProtInfo(prot2pang, pangenome, output, cpu, draw_related)
    else:
        partProj = projectPartition(prot2pang, protSet, output)  # write the partition assignment only
        logging.getLogger().info(
            f"proteins partition projection : '{partProj}'")
    logging.getLogger().info(
        f"{len(prot2pang)} proteins over {len(protSet)} have at least one hit in the pangenome."
    )
    logging.getLogger().info(
        f"Blast-tab file of the alignment : '{alignFile}'")

    tmpPangFile.close()
    newtmpdir.cleanup()
Code example #7
def drawSpots(pangenome, output, spot_list, disable_bar):
    # check that the pangenome has spots
    checkPredictedSpots(pangenome)

    needMod = False
    if pangenome.status["modules"] != "No":
        # modules are not required to be loaded, but if they have been computed we load them.
        needMod = True

    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=False,
                       needPartitions=True,
                       needRGP=True,
                       needSpots=True,
                       needModules=needMod,
                       disable_bar=disable_bar)

    selected_spots = set()
    curated_spot_list = [
        'spot_' + str(s) if 'spot' not in s else str(s)
        for s in spot_list.split(',')
    ]

    if spot_list == 'all' or any(x == 'all' for x in curated_spot_list):
        selected_spots = [
            s for s in pangenome.spots if len(s.getUniqOrderedSet()) > 1
        ]
    else:
        selected_spots = [
            s for s in pangenome.spots
            if "spot_" + str(s.ID) in curated_spot_list
        ]
    if len(selected_spots) < 10:
        logging.getLogger().info(
            f"Drawing the following spots: {','.join(['spot_' + str(s.ID) for s in selected_spots])}"
        )
    else:
        logging.getLogger().info(f"Drawing {len(selected_spots)} spots")

    drawSelectedSpots(
        selected_spots,
        pangenome,
        output,
        overlapping_match=pangenome.parameters["spots"]["overlapping_match"],
        exact_match=pangenome.parameters["spots"]["exact_match"],
        set_size=pangenome.parameters["spots"]["set_size"],
        disable_bar=disable_bar)
Code example #8
def genomes_fluidity(pangenome, disable_bar=False):
    """ Compute the genomes fluidity from the pangenome

    :param pangenome: pangenome which will be used to compute the genomes fluidity
    :type pangenome: Pangenome
    :param disable_bar: Disable the progress bar
    :type disable_bar: bool

    :return: Genomes fluidity value from the pangenome
    :rtype:float
    """

    # check statuses and load info
    logging.getLogger().info("Check information in pangenome")
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       disable_bar=disable_bar)
    fluidity_dict = {
        'all': None,
        'shell': None,
        'cloud': None,
        'accessory': None
    }
    for subset in fluidity_dict.keys():
        logging.getLogger().debug(f"Compute binaries for {subset} partition")
        pangenome.compute_org_bitarrays(part=subset)
        # Compute binaries corresponding to presence / absence of families in organisms
        g_sum = 0
        logging.getLogger().debug("Get number of families in each organisms")
        org2_nb_fam = nb_fam_per_org(pangenome, disable_bar)
        logging.getLogger().info(
            f"Computing the rate of unique families for each pair of genomes in {subset}"
        )
        for c_organisms in tqdm(list(combinations(pangenome.organisms, 2)),
                                unit="combination",
                                disable=disable_bar):
            tot_fam = org2_nb_fam.get(c_organisms[0].name) + org2_nb_fam.get(
                c_organisms[1].name)
            common_fam = popcount(c_organisms[0].bitarray
                                  & c_organisms[1].bitarray) - 1
            if tot_fam > 0 and common_fam > 0:
                g_sum += (tot_fam - 2 * common_fam) / tot_fam
        fluidity_dict[subset] = (
            2 / (pangenome.number_of_organisms() *
                 (pangenome.number_of_organisms() - 1))) * g_sum
    return fluidity_dict
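
The quantity accumulated in g_sum is the genomic fluidity of Kislyuk et al. (2011): for each pair of genomes, the ratio of families unique to either genome over the total families of the pair, averaged over all N(N-1)/2 pairs; (tot_fam - 2 * common_fam) / tot_fam is exactly that ratio. A self-contained sketch using plain sets instead of bitarrays (the helper and its inputs are made up, and it omits the popcount - 1 adjustment used above):

from itertools import combinations

def fluidity(fams_per_genome):
    # fams_per_genome: dict mapping a genome name to its set of family identifiers
    n = len(fams_per_genome)
    g_sum = 0.0
    for fams_a, fams_b in combinations(fams_per_genome.values(), 2):
        tot = len(fams_a) + len(fams_b)
        shared = len(fams_a & fams_b)
        if tot > 0:
            g_sum += (tot - 2 * shared) / tot  # unique families / total families of the pair
    return 2 / (n * (n - 1)) * g_sum

fluidity({'g1': {'A', 'B'}, 'g2': {'B', 'C'}, 'g3': {'A', 'C', 'D'}})  # ~0.57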
Code example #9
def predictHotspots(pangenome, output, force=False, cpu=1, spot_graph=False, overlapping_match=2, set_size=3,
                    exact_match=1, draw_hotspot=False, interest=""):
    # check that given parameters for hotspot computation make sense
    checkParameterLogic(overlapping_match, set_size, exact_match)
    # check for formerly computed stuff, and erase if allowed
    checkPangenomeFormerSpots(pangenome, force)
    # check statuses and load info
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True,
                       needRGP=True)

    # get multigenic gene families
    logging.getLogger().info("Detecting multigenic families...")
    multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"])

    logging.getLogger().info("Detecting hotspots in the pangenome...")

    # predict spots
    spots = makeSpotGraph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size, exact_match)

    # define elements of interest (e.g. gene name, product substring) to search for in gene annotations
    if interest != "":
        elements = [el.strip() for el in interest.split(',')]
    else:
        elements = []

    # draw spots of interest
    if draw_hotspot:
        drawn_spots = select_spots(pangenome, spots, elements)
        if len(drawn_spots) > 0:
            draw_spots(drawn_spots, output, cpu, overlapping_match, exact_match, set_size, multigenics, elements)

    pangenome.addSpots(spots)
    pangenome.status["spots"] = "Computed"
    pangenome.parameters["spots"] = {}
    pangenome.parameters["spots"]["set_size"] = set_size
    pangenome.parameters["spots"]["overlapping_match"] = overlapping_match
    pangenome.parameters["spots"]["exact_match"] = exact_match
Code example #10
File: writeMSA.py Project: tauqeer9/PPanGGOLiN
def writeMSAFiles(pangenome,
                  output,
                  cpu=1,
                  partition="core",
                  tmpdir="/tmp",
                  source="protein",
                  force=False,
                  show_bar=True):

    needPartitions = False
    if partition in ["persistent", "shell", "cloud"]:
        needPartitions = True

    outname = output + f"/msa_{partition}_{source}/"
    mkOutdir(outname, force=force)

    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needPartitions=needPartitions,
                       needGeneSequences=True,
                       show_bar=show_bar)
    logging.getLogger().info(f"computing msa for {partition} families...")
    families = getFamiliesToWrite(pangenome, partitionFilter=partition)

    # this must exist since we loaded the pangenome and families are required
    code = pangenome.parameters["cluster"]["translation_table"]

    computeMSA(families,
               outname,
               cpu=cpu,
               tmpdir=tmpdir,
               source=source,
               code=code,
               show_bar=show_bar)
    logging.getLogger().info(f"Done writing all {partition} MSA in: {outname}")
Code example #11
def projectRGP(pangenome,
               annotation,
               output,
               tmpdir,
               identity=0.8,
               coverage=0.8,
               defrag=False,
               cpu=1,
               translation_table=11):
    if pangenome.status["geneFamilySequences"] not in [
            "inFile", "Loaded", "Computed"
    ]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )

    # read the given file
    logging.getLogger().info("Retrieving the annotations from the given file")
    # a new 'pangenome' is needed, as the annotation reading functions take a pangenome as input
    singleOrgPang = Pangenome()
    filetype = detect_filetype(annotation)
    if filetype == "gff":
        # if there are no sequences in the gff, this value will change to 'No'
        singleOrgPang.status["geneSequences"] = "Computed"
        read_org_gff(singleOrgPang, 'myGenome', annotation, [], True)
        if singleOrgPang.status["geneSequences"] == "No":
            raise Exception(
                f"The given annotation file did not have a FASTA sequence included (expected '##FASTA' pragma followed by a fasta-like file format). This is required for computing the Regions of Genomic Plasticity of your organism"
            )
    elif filetype == "gbff":
        read_org_gbff(singleOrgPang, 'myGenome', annotation, [], True)

    # check and read the given pangenome
    checkPangenomeInfo(pangenome,
                       needFamilies=True,
                       needPartitions=True,
                       needAnnotations=True)

    newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    tmpPangFile = tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name)
    tmpGeneFile = tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name)

    writeGeneSequencesFromAnnotations(singleOrgPang, tmpGeneFile)
    writeGeneFamSequences(pangenome, tmpPangFile)

    blastout = alignSeqToPang(tmpPangFile, tmpGeneFile, output, newtmpdir, cpu,
                              defrag, identity, coverage, True,
                              translation_table)

    tmpPangFile.close()
    tmpGeneFile.close()
    newtmpdir.cleanup()
    # artificially reconstruct the gene families and their partitions
    linkNewGenomeFamilies(singleOrgPang, pangenome, blastout)

    multigenics = pangenome.get_multigenics(
        pangenome.parameters["RGP"]["dup_margin"])
    genomeMultigenics = linkMultigenicFamilies(singleOrgPang, multigenics)

    logging.getLogger().info("Predicting RGP in your genome")
    for org in singleOrgPang.organisms:
        genomeRGP = compute_org_rgp(
            org, pangenome.parameters["RGP"]["persistent_penalty"],
            pangenome.parameters["RGP"]["variable_gain"],
            pangenome.parameters["RGP"]["min_length"],
            pangenome.parameters["RGP"]["min_score"], genomeMultigenics)

    if filetype == "gff":
        # re-read the file and insert sequence_feature objects corresponding to the predicted regions
        logging.getLogger().info("Writing the RGP in a gff file...")
        writeGffRegions(annotation, genomeRGP, output)
    elif filetype == "gbff":
        logging.getLogger().info("Writing the RGP in a gbff file...")
        writeGbffRegions(annotation, genomeRGP, output)
Code example #12
File: tile_plot.py Project: vinisalazar/PPanGGOLiN
def drawTilePlot(pangenome, output, nocloud=False):
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=True)
    if pangenome.status["partitionned"] == "No":
        raise Exception(
            "Cannot draw the tile plot as your pangenome has not been partitionned"
        )
    if len(pangenome.organisms) > 500 and nocloud is False:
        logging.getLogger().warning(
            "You asked to draw a tile plot for a lot of organisms (>500). Your browser will probably not be able to open it."
        )
    logging.getLogger().info("Drawing the tile plot...")
    data = []
    all_indexes = []
    all_columns = []
    fam2index = {}
    index2fam = {}
    if nocloud:
        families = {
            fam
            for fam in pangenome.geneFamilies
            if not fam.partition.startswith("C")
        }
    else:
        families = set(pangenome.geneFamilies)
    org_index = pangenome.getIndex()
    index2org = {}
    for org, index in org_index.items():
        index2org[index] = org
    COLORS = {
        "pangenome": "black",
        "exact_accessory": "#EB37ED",
        "exact_core": "#FF2828",
        "soft_core": "#c7c938",
        "soft_accessory": "#996633",
        "shell": "#00D860",
        "persistent": "#F7A507",
        "cloud": "#79DEFF",
        "undefined": "#828282"
    }

    logging.getLogger().info("start with matrice")

    for row, fam in enumerate(families):
        new_col = [org_index[org] for org in fam.organisms]
        all_indexes.extend([row] * len(new_col))
        all_columns.extend(new_col)
        data.extend([1.0] * len(new_col))
        index2fam[row] = fam.name
        fam2index[fam.name] = row

    mat_p_a = csc_matrix((data, (all_indexes, all_columns)),
                         shape=(len(families), len(pangenome.organisms)),
                         dtype='float')
    dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense())
    hc = linkage(dist, 'single')

    dendro = dendrogram(hc, no_plot=True)
    logging.getLogger().info(
        "done with making the dendrogram to order the organisms on the plot")

    order_organisms = [index2org[index] for index in dendro["leaves"]]

    binary_data = []
    text_data = []
    fam_order = []
    partitions_dict = defaultdict(list)
    shell_subs = set()
    for fam in families:
        partitions_dict[fam.partition].append(fam)
        if fam.partition.startswith("S"):
            # the number of distinct shell partition labels gives the number of subpartitions
            shell_subs.add(fam.partition)
    ordered_nodes = []
    ordered_nodes_p = sorted(partitions_dict["P"], key=lambda n: len(n.organisms), reverse=True)
    ordered_nodes_c = sorted(partitions_dict["C"], key=lambda n: len(n.organisms), reverse=True)
    sep_p = len(ordered_nodes_p) - 0.5
    separators = [sep_p]
    shell_NA = None
    if len(shell_subs) == 1:
        ordered_nodes_s = sorted(partitions_dict[shell_subs.pop()], key=lambda n: len(n.organisms), reverse=True)
        ordered_nodes = ordered_nodes_p + ordered_nodes_s + ordered_nodes_c
        separators.append(separators[-1] + len(ordered_nodes_s))
        separators.append(separators[-1] + len(ordered_nodes_c))
    else:
        ordered_nodes = ordered_nodes_p
        for subpartition in sorted(shell_subs):
            if subpartition == "S_":
                shell_NA = len(separators) - 1
            ordered_nodes_s = sorted(partitions_dict[subpartition], key=lambda n: len(n.organisms), reverse=True)
            ordered_nodes += ordered_nodes_s
            separators.append(separators[-1] + len(ordered_nodes_s))
        ordered_nodes += ordered_nodes_c
        separators.append(separators[-1] + len(ordered_nodes_c))

    logging.getLogger().info(
        "Getting the gene name(s) and the number for each tile of the plot ..."
    )
    for node in ordered_nodes:
        fam_order.append('\u200c' + node.name)  # zero-width non-joiner keeps numeric-looking names as text labels
        data = node.organisms
        binary_data.append([
            len(node.getGenesPerOrg(org)) if org in data else numpy.nan
            for org in order_organisms
        ])
        text_data.append([("\n".join(map(str, node.getGenesPerOrg(org))))
                          if org in data else numpy.nan
                          for org in order_organisms])

    xaxis_values = ['\u200c' + org.name for org in order_organisms]

    logging.getLogger().info(
        "Done extracting names and numbers. Making the heatmap ...")

    heatmap = go.Heatmap(z=binary_data,
                         x=xaxis_values,
                         y=fam_order,
                         text=text_data,
                         zauto=False,
                         zmin=1,
                         zmax=2,
                         autocolorscale=False,
                         colorscale=[[0.50, 'rgb(100, 15, 78)'],
                                     [1, 'rgb(59, 157, 50)']],
                         colorbar=dict(title='Presence/Absence',
                                       titleside='top',
                                       tickmode='array',
                                       tickvals=[1, 2],
                                       ticktext=['Presence', 'Multicopy'],
                                       ticks='outside'))
    shell_color = None
    if len(shell_subs) > 1:
        if "S_" not in shell_subs:
            shell_color = cl.interp(cl.flipper()['seq']['9']['Greens'][1:7],
                                    len(shell_subs))
        else:
            shell_color = cl.interp(cl.flipper()['seq']['9']['Greens'][1:7],
                                    len(shell_subs) - 1)
    shapes = []
    sep_prec = 0
    for nb, sep in enumerate(separators):
        color = None
        if nb == 0:
            color = COLORS["persistent"]
        elif nb == (len(separators) - 1):
            color = COLORS["cloud"]
        elif len(shell_subs) > 1:
            if shell_NA is not None and nb == shell_NA:
                color = COLORS["shell"]
            else:
                color = shell_color.pop()
        else:
            color = COLORS["shell"]
        shapes.append(
            dict(type='line',
                 x0=-1,
                 x1=-1,
                 y0=sep_prec,
                 y1=sep,
                 line=dict(dict(width=10, color=color))))
        shapes.append(
            dict(type='line',
                 x0=len(pangenome.organisms),
                 x1=len(pangenome.organisms),
                 y0=sep_prec,
                 y1=sep,
                 line=dict(dict(width=10, color=color))))
        shapes.append(
            dict(type='line',
                 x0=-1,
                 x1=len(pangenome.organisms),
                 y0=sep,
                 y1=sep,
                 line=dict(dict(width=1, color=color))))
        sep_prec = sep

    layout = go.Layout(title="presence/absence matrix",
                       xaxis=go.layout.XAxis(ticktext=xaxis_values,
                                             title='organisms',
                                             tickvals=xaxis_values,
                                             automargin=True,
                                             tickfont=dict(size=10)),
                       yaxis=go.layout.YAxis(ticktext=fam_order,
                                             tickvals=fam_order,
                                             title='gene families',
                                             automargin=True,
                                             tickfont=dict(size=10)),
                       shapes=shapes,
                       plot_bgcolor='#ffffff')
    logging.getLogger().info("Drawing the figure itself...")
    out_plotly.plot(go.Figure(data=[heatmap], layout=layout),
                    filename=output + "/tile_plot.html",
                    auto_open=False)
    logging.getLogger().info(
        f"Done with the tile plot: '{output + '/tile_plot.html'}'")
Code example #13
def drawUCurve(pangenome, output, soft_core=0.95, disable_bar=False):
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=True,
                       disable_bar=disable_bar)
    logging.getLogger().info("Drawing the U-shaped curve...")
    max_bar = 0
    count = defaultdict(lambda: defaultdict(int))
    is_partitioned = False
    has_undefined = False
    for fam in pangenome.geneFamilies:
        nb_org = len(fam.organisms)
        if fam.partition != "":
            is_partitioned = True
            if fam.partition == "U":
                has_undefined = True
            count[nb_org][fam.namedPartition] += 1
        count[nb_org]["pangenome"] += 1
        max_bar = max(max_bar, count[nb_org]["pangenome"])
    data_plot = []
    chao = "NA"
    if count[2]["pangenome"] > 0:
        # Chao1 lower bound on total richness: S_obs + f1^2 / (2 * f2), where f1 and f2
        # are the numbers of families seen in exactly one and exactly two genomes
        # (note: '**' is exponentiation here; the original '^' was a bitwise XOR)
        chao = round(len(pangenome.geneFamilies) +
                     ((count[1]["pangenome"] ** 2) / (count[2]["pangenome"] * 2)), 2)
    COLORS = {
        "pangenome": "black",
        "exact_accessory": "#EB37ED",
        "exact_core": "#FF2828",
        "soft_core": "#c7c938",
        "soft_accessory": "#996633",
        "shell": "#00D860",
        "persistent": "#F7A507",
        "cloud": "#79DEFF",
        "undefined": "#828282"
    }

    if is_partitioned and not has_undefined:
        persistent_values = []
        shell_values = []
        cloud_values = []
        for nb_org in range(1, len(pangenome.organisms) + 1):
            persistent_values.append(count[nb_org]["persistent"])
            shell_values.append(count[nb_org]["shell"])
            cloud_values.append(count[nb_org]["cloud"])
        data_plot.append(
            go.Bar(x=list(range(1,
                                len(pangenome.organisms) + 1)),
                   y=persistent_values,
                   name='persistent',
                   marker=dict(color=COLORS["persistent"])))
        data_plot.append(
            go.Bar(x=list(range(1,
                                len(pangenome.organisms) + 1)),
                   y=shell_values,
                   name='shell',
                   marker=dict(color=COLORS["shell"])))
        data_plot.append(
            go.Bar(x=list(range(1,
                                len(pangenome.organisms) + 1)),
                   y=cloud_values,
                   name='cloud',
                   marker=dict(color=COLORS["cloud"])))
    else:
        text = 'undefined' if has_undefined else "pangenome"
        undefined_values = []
        for nb_org in range(1, len(pangenome.organisms) + 1):
            undefined_values.append(count[nb_org][text])
        data_plot.append(
            go.Bar(x=list(range(1,
                                len(pangenome.organisms) + 1)),
                   y=undefined_values,
                   name=text,
                   marker=dict(color=COLORS[text])))
    x = len(pangenome.organisms) * soft_core
    layout = go.Layout(
        title="Gene families frequency distribution (U shape), chao=" +
        str(chao),
        xaxis=dict(title='Occurring in x genomes'),
        yaxis=dict(title='# of gene families (F)'),
        barmode='stack',
        shapes=[
            dict(type='line',
                 x0=x,
                 x1=x,
                 y0=0,
                 y1=max_bar,
                 line=dict(dict(width=5, dash='dashdot', color="grey")))
        ],
        plot_bgcolor='#ffffff')

    fig = go.Figure(data=data_plot, layout=layout)
    out_plotly.plot(fig,
                    filename=output + "/Ushaped_plot.html",
                    auto_open=False)
    logging.getLogger().info(
        f"Done drawing the U-shaped curve: '{output + '/Ushaped_plot.html'}'")
Code example #14
File: partition.py Project: pythseq/PPanGGOLiN
def partition(pangenome,
              tmpdir,
              outputdir=None,
              beta=2.5,
              sm_degree=10,
              free_dispersion=False,
              chunk_size=500,
              K=-1,
              Krange=None,
              ICL_margin=0.05,
              draw_ICL=False,
              cpu=1,
              seed=42,
              keep_tmp_files=False):

    Krange = Krange or [3, 20]
    global pan
    global samples
    pan = pangenome

    if draw_ICL and outputdir is None:
        raise Exception(
            "Impossible combination of options: you asked to draw the ICL curves but did not provide an output directory!"
        )

    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=True)
    organisms = set(pangenome.organisms)

    tmpdirObj = tempfile.TemporaryDirectory(dir=tmpdir)
    tmpdir = tmpdirObj.name

    if len(organisms) <= 10:
        logging.getLogger().warning(
            f"The number of selected organisms is too low ({len(organisms)} organisms used) to robustly partition the graph"
        )

    pangenome.parameters["partition"] = {}
    pangenome.parameters["partition"]["beta"] = beta
    pangenome.parameters["partition"]["free_dispersion"] = free_dispersion
    pangenome.parameters["partition"][
        "max_node_degree_for_smoothing"] = sm_degree
    if len(organisms) > chunk_size:
        pangenome.parameters["partition"]["chunk_size"] = chunk_size
    pangenome.parameters["partition"]["computed_K"] = False

    if K < 3:
        pangenome.parameters["partition"]["computed_K"] = True
        logging.getLogger().info(
            "Estimating the optimal number of partitions...")
        K = evaluate_nb_partitions(organisms, sm_degree, free_dispersion,
                                   chunk_size, Krange, ICL_margin, draw_ICL,
                                   cpu, tmpdir, seed, outputdir)
        logging.getLogger().info(
            f"The number of partitions has been evaluated at {K}")

    pangenome.parameters["partition"]["K"] = K
    init = "param_file"

    partitionning_results = {}

    families = set()
    cpt = 0
    cpt_partition = {}
    random.seed(seed)

    for fam in pangenome.geneFamilies:
        families.add(fam)
        if chunk_size < len(organisms):
            cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0}

    start_partitionning = time.time()
    logging.getLogger().info("Partitioning...")
    pansize = len(families)
    if chunk_size < len(organisms):
        validated = set()

        def validate_family(result):
            for node, nem_class in result[0].items():
                cpt_partition[node][nem_class[0]] += 1
                sum_partitioning = sum(cpt_partition[node].values())
                if ((sum_partitioning > len(organisms) / chunk_size
                     and max(cpt_partition[node].values()) >= sum_partitioning * 0.5)
                        or sum_partitioning > len(organisms)):
                    if node not in validated:
                        if max(cpt_partition[node].values()) < sum_partitioning * 0.5:
                            # if no absolute majority emerges despite all the partitionings,
                            # the family is set to undefined
                            cpt_partition[node]["U"] = len(organisms)
                        validated.add(node)

        org_nb_sample = Counter()
        for org in organisms:
            org_nb_sample[org] = 0
        condition = len(organisms) / chunk_size
        while len(validated) < pansize:
            prev = len(samples)  # if we've been sampling already, samples is not empty
            # each family must be tested at least len(organisms) / chunk_size times
            while not all(val >= condition for val in org_nb_sample.values()):
                shuffled_orgs = list(organisms)  # copy the set of organisms
                random.shuffle(shuffled_orgs)  # shuffle the copied list
                while len(shuffled_orgs) > chunk_size:
                    samples.append(set(shuffled_orgs[:chunk_size]))
                    for org in samples[-1]:
                        org_nb_sample[org] += 1
                    shuffled_orgs = shuffled_orgs[chunk_size:]
            args = []
            # tmpdir, beta, sm_degree, free_dispersion, K, seed
            for i, _ in enumerate(samples[prev:], start=prev):
                args.append((i, tmpdir, beta, sm_degree, free_dispersion, K,
                             seed, init, keep_tmp_files))

            logging.getLogger().info("Launching NEM")
            with Pool(processes=cpu) as p:
                #launch partitionnings
                bar = tqdm(range(len(args)), unit=" samples partitionned")
                for result in p.imap_unordered(nemSamples, args):
                    validate_family(result)
                    bar.update()

                bar.close()
                condition += 1  #if len(validated) < pan_size, we will want to resample more.
                logging.getLogger().debug(
                    f"There are {len(validated)} validated families out of {pansize} families."
                )
                p.close()
                p.join()
        for fam, data in cpt_partition.items():
            partitionning_results[fam] = max(data, key=data.get)

        # TODO: compute the median vectors of each partition?
        partitionning_results = [partitionning_results, []]  # introduces a 'non feature'

        logging.getLogger().info(
            f"Did {len(samples)} partitionings with chunks of size {chunk_size} among {len(organisms)} genomes in {round(time.time() - start_partitionning, 2)} seconds."
        )
    else:
        edges_weight, nb_fam = write_nem_input_files(tmpdir + "/" + str(cpt) +
                                                     "/",
                                                     organisms,
                                                     sm_degree=sm_degree)
        partitionning_results = run_partitioning(tmpdir + "/" + str(cpt) + "/",
                                                 len(organisms),
                                                 beta *
                                                 (nb_fam / edges_weight),
                                                 free_dispersion,
                                                 K=K,
                                                 seed=seed,
                                                 init=init,
                                                 keep_files=keep_tmp_files)
        if partitionning_results == [{}, None, None]:
            raise Exception(
                "Statistical partitioning does not work on your data. This usually happens because you used very few (<15) genomes."
            )
        cpt += 1
        logging.getLogger().info(
            f"Partitioned {len(organisms)} genomes in {round(time.time() - start_partitionning, 2)} seconds."
        )

    # pangenome.savePartitionParameters(K, beta, free_dispersion, sm_degree, partitionning_results[1], chunk_size)

    for famName, partition in partitionning_results[0].items():
        pangenome.getGeneFamily(famName).partition = partition

    pangenome.status["partitionned"] = "Computed"
    if not keep_tmp_files:
        tmpdirObj.cleanup()
    else:
        copytree(tmpdir, outputdir + "/NEM_files/")
Code example #15
File: rarefaction.py Project: tauqeer9/PPanGGOLiN
def makeRarefactionCurve(pangenome, output, tmpdir, beta=2.5, depth=30, minSampling=1, maxSampling=100,
                         sm_degree=10, free_dispersion=False, chunk_size=500, K=-1, cpu=1, seed=42,
                         kestimate=False, krange=None, soft_core=0.95, show_bar=True):

    krange = list(krange) if krange is not None else [3, -1]  # avoid mutating a shared default list
    ppp.pan = pangenome  # use the global from partition to store the pangenome, so that it is usable

    try:
        krange[0] = ppp.pan.parameters["partition"]["K"] if krange[0]<0 else krange[0]
        krange[1] = ppp.pan.parameters["partition"]["K"] if krange[1]<0 else krange[1]
    except KeyError:
        krange=[3,20]
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=True, show_bar=show_bar)

    tmpdirObj = tempfile.TemporaryDirectory(dir=tmpdir)
    tmpdir = tmpdirObj.name

    maxSampling = min(int(maxSampling), len(pangenome.organisms))

    if K < 3 and kestimate is False:  # estimate K once and for all
        try:
            K = ppp.pan.parameters["partition"]["K"]
            logging.getLogger().info(f"Reuse the number of partitions {K}")
        except KeyError:
            logging.getLogger().info("Estimating the number of partitions...")
            K = ppp.evaluate_nb_partitions(pangenome.organisms, sm_degree, free_dispersion, chunk_size, krange, 0.05, False, cpu, tmpdir, seed, None)
            logging.getLogger().info(f"The number of partitions has been evaluated at {K}")

    logging.getLogger().info("Extracting samples ...")
    AllSamples = []
    for i in range(minSampling, maxSampling):  # each sampling point
        for _ in range(depth):  # number of samples per point
            AllSamples.append(set(random.sample(set(pangenome.organisms), i + 1)))
    logging.getLogger().info(f"Done sampling organisms in the pangenome, there are {len(AllSamples)} samples")
    SampNbPerPart = []

    logging.getLogger().info("Computing bitarrays for each family...")
    index_org = pangenome.computeFamilyBitarrays()
    logging.getLogger().info(f"Done computing bitarrays. Comparing them to get exact and soft core stats for {len(AllSamples)} samples...")

    bar = tqdm( range(len(AllSamples) * len(pangenome.geneFamilies)), unit = "gene family", disable=not show_bar)
    for samp in AllSamples:
        # make the sample's organism bitarray
        sampBitarray = gmpy2.xmpz(0)  # pylint: disable=no-member
        for org in samp:
            sampBitarray[index_org[org]] = 1

        part = Counter()
        part["soft_core"] = 0
        part["exact_core"] = 0
        part["exact_accessory"] = 0
        part["soft_accessory"] = 0
        for fam in pangenome.geneFamilies:
            nbCommonOrg = gmpy2.popcount(fam.bitarray & sampBitarray)  # pylint: disable=no-member
            part["nborgs"] = len(samp)
            if nbCommonOrg != 0:  # otherwise the family is absent from this sample
                if nbCommonOrg == len(samp):
                    part["exact_core"] +=1
                else:
                    part["exact_accessory"] +=1

                if float(nbCommonOrg) >= len(samp) * soft_core:
                    part["soft_core"] +=1
                else:
                    part["soft_accessory"] +=1
            bar.update()
        SampNbPerPart.append(part)
    bar.close()
    # done with the frequency of each family for each sample

    global samples
    samples = AllSamples

    args = []
    for index, samp in enumerate(samples):
        args.append((index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, K, krange, seed))

    with Pool(processes = cpu) as p:
        # launch partitionings
        logging.getLogger().info("Partitioning all samples...")
        bar = tqdm(range(len(args)), unit="samples partitioned", disable=not show_bar)
        random.shuffle(args)  # shuffle the processing so that the progress bar is closer to reality
        for result in p.imap_unordered(launch_raref_nem, args):
            SampNbPerPart[result[1]] = {**result[0], **SampNbPerPart[result[1]]}
            bar.update()
    bar.close()

    logging.getLogger().info("Done partitionning everything")
    warnings.filterwarnings("ignore")
    drawCurve(output, maxSampling, SampNbPerPart )
    warnings.resetwarnings()
    tmpdirObj.cleanup()
    logging.getLogger().info("Done making the rarefaction curves")
Code example #16
def writeFlatFiles(pangenome,
                   output,
                   cpu=1,
                   soft_core=0.95,
                   dup_margin=0.05,
                   csv=False,
                   genePA=False,
                   gexf=False,
                   light_gexf=False,
                   projection=False,
                   stats=False,
                   json=False,
                   partitions=False,
                   families_tsv=False,
                   all_genes=False,
                   all_prot_families=False,
                   all_gene_families=False,
                   compress=False):
    global pan
    pan = pangenome
    processes = []
    if any(x for x in [
            csv, genePA, gexf, light_gexf, projection, stats, json, partitions,
            families_tsv, all_genes, all_prot_families, all_gene_families
    ]):
        # then it's useful to load the pangenome
        checkPangenomeInfo(pan,
                           needAnnotations=True,
                           needFamilies=True,
                           needGraph=True)
        ex_partitionned = Exception(
            "The provided pangenome has not been partitioned. This is not compatible with any of the following options : --light_gexf, --gexf, --csv, --partitions"
        )
        ex_genesClustered = Exception(
            "The provided pangenome has no gene families. This is not compatible with any of the following options : --families_tsv, --all_prot_families, --all_gene_families"
        )
        ex_genomesAnnotated = Exception(
            "The provided pangenome has no annotated sequences. This is not compatible with any of the following options : --all_genes"
        )
        ex_geneSequences = Exception(
            "The provided pangenome has no gene sequences. This is not compatible with any of the following options : --all_genes, --all_gene_families"
        )
        ex_geneFamilySequences = Exception(
            "The provided pangenome has no gene families. This is not compatible with any of the following options : --all_prot_families, --all_gene_families"
        )
        if not pan.status["partitionned"] in ["Loaded", "Computed"] and (
                light_gexf or gexf or csv or projection or partitions
        ):  #could allow to write the csv or genePA without partition...
            raise ex_partitionned
        if not pan.status["genesClustered"] in ["Loaded", "Computed"
                                                ] and (families_tsv):
            raise ex_genesClustered
        if not pan.status["genomesAnnotated"] in ["Loaded", "Computed"
                                                  ] and (all_genes):
            raise ex_genomesAnnotated
        if not pan.status["geneSequences"] in ["inFile"] and (
                all_genes or all_gene_families):
            raise ex_geneSequences
        if not pan.status["geneFamilySequences"] in ["Loaded", "Computed"
                                                     ] and (all_prot_families):
            raise ex_geneFamilySequences
        pan.getIndex()  #make the index because it will be used most likely
        with Pool(processes=cpu) as p:
            if csv:
                processes.append(
                    p.apply_async(func=writeMatrix,
                                  args=(',', "csv", output, compress, True)))
            if genePA:
                processes.append(
                    p.apply_async(func=writeGenePresenceAbsence,
                                  args=(output, compress)))
            if gexf:
                processes.append(
                    p.apply_async(func=writeGEXF,
                                  args=(output, False, soft_core, compress)))
            if light_gexf:
                processes.append(
                    p.apply_async(func=writeGEXF,
                                  args=(output, True, soft_core, compress)))
            if projection:
                processes.append(
                    p.apply_async(func=writeProjections,
                                  args=(output, compress)))
            if stats:
                processes.append(
                    p.apply_async(func=writeStats,
                                  args=(output, soft_core, dup_margin,
                                        compress)))
            if json:
                processes.append(
                    p.apply_async(func=writeJSON, args=(output, compress)))
            if partitions:
                processes.append(
                    p.apply_async(func=writeParts,
                                  args=(output, soft_core, compress)))
            if families_tsv:
                processes.append(
                    p.apply_async(func=writeGeneFamiliesTSV,
                                  args=(output, compress)))
            if all_genes:
                processes.append(
                    p.apply_async(func=writeGeneSequences,
                                  args=(output, compress)))
            if all_prot_families:
                processes.append(
                    p.apply_async(func=writeFastaProtFam,
                                  args=(output, compress)))
            if all_gene_families:
                processes.append(
                    p.apply_async(func=writeFastaGenFam,
                                  args=(output, compress)))
            for process in processes:
                process.get()  # collect results and surface any worker exception
Code example #17
def align(pangenome,
          sequenceFile,
          output,
          tmpdir,
          identity=0.8,
          coverage=0.8,
          no_defrag=False,
          cpu=1,
          getinfo=False,
          draw_related=False,
          disable_bar=False):
    if pangenome.status["geneFamilySequences"] not in [
            "inFile", "Loaded", "Computed"
    ]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives "
            "associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )
    # could be possible either by picking a representative somehow, or by aligning on genes rather than on
    # families, if they are in the pangenome.

    if getinfo or draw_related:
        need_mod = False
        if pangenome.status["modules"] != "No":
            # modules are not required to be loaded, but if they have been computed we load them.
            need_mod = True
        checkPangenomeInfo(pangenome,
                           needAnnotations=True,
                           needFamilies=True,
                           needPartitions=True,
                           needRGP=True,
                           needSpots=True,
                           needModules=need_mod,
                           disable_bar=disable_bar)
    else:
        checkPangenomeInfo(pangenome,
                           needFamilies=True,
                           disable_bar=disable_bar)

    new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)

    seqSet, alignFile, seq2pang = get_seq2pang(pangenome, sequenceFile, output,
                                               new_tmpdir, cpu, no_defrag,
                                               identity, coverage)

    if getinfo or draw_related:
        getSeqInfo(seq2pang,
                   pangenome,
                   output,
                   draw_related,
                   disable_bar=disable_bar)
    partProj = projectPartition(seq2pang, seqSet,
                                output)  # write the partition assignment only
    logging.getLogger().info(f"Sequence partition projection: '{partProj}'")
    logging.getLogger().info(
        f"{len(seq2pang)} sequences out of {len(seqSet)} have at least one hit in the pangenome."
    )
    logging.getLogger().info(
        f"Blast-tab file of the alignment: '{alignFile}'")

    new_tmpdir.cleanup()
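A minimal usage sketch for align. The loading step is an assumption: Pangenome() and addFile() mirror the camelCase API used elsewhere in this PPanGGOLiN version, but the exact calls may differ between releases, and the file names are illustrative.

from ppanggolin.pangenome import Pangenome

# assumed loading step: bind an existing pangenome .h5 file to a Pangenome object
pangenome = Pangenome()
pangenome.addFile("pangenome.h5")

# align external query sequences against the pangenome's gene family representatives
align(pangenome,
      sequenceFile="queries.fasta",  # hypothetical FASTA of query sequences
      output="align_out",
      tmpdir="/tmp",
      identity=0.8,
      coverage=0.8,
      cpu=4)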
Code example #18
File: writeSequences.py Project: tauqeer9/PPanGGOLiN
def writeSequenceFiles(pangenome,
                       output,
                       fasta=None,
                       anno=None,
                       cpu=1,
                       regions=None,
                       genes=None,
                       gene_families=None,
                       prot_families=None,
                       compress=False,
                       show_bar=True):
    if not any(x for x in [regions, genes, prot_families, gene_families]):
        raise Exception("You did not indicate what file you wanted to write.")

    needAnnotations = False
    needFamilies = False
    needGraph = False
    needPartitions = False
    needSpots = False
    needRegions = False

    if any(x is not None
           for x in [regions, genes, gene_families, prot_families]):
        needAnnotations = True
    if regions is not None or any(
            x == "rgp" for x in (genes, gene_families, prot_families)):
        needRegions = True
    if regions is not None or gene_families is not None or prot_families is not None or genes in [
            "persistent", "shell", "cloud"
    ]:
        needFamilies = True
    if any(x in ["persistent", "shell", "cloud"]
           for x in (genes, gene_families, prot_families)):
        needPartitions = True

    # sequence-related flags have to be handled outside of checkPangenomeInfo,
    # since it does not check the sequence statuses
    ex_geneSequences = Exception(
        "The provided pangenome has no gene sequences. This is not compatible with any of the following options: --genes, --gene_families"
    )
    ex_geneFamilySequences = Exception(
        "The provided pangenome has no gene families. This is not compatible with any of the following options: --prot_families, --gene_families"
    )
    if pangenome.status["geneSequences"] not in ["inFile"] and (genes or
                                                                gene_families):
        raise ex_geneSequences
    if pangenome.status["geneFamilySequences"] not in [
            "Loaded", "Computed", "inFile"
    ] and prot_families:
        raise ex_geneFamilySequences

    checkPangenomeInfo(pangenome,
                       needAnnotations=needAnnotations,
                       needFamilies=needFamilies,
                       needGraph=needGraph,
                       needPartitions=needPartitions,
                       needRGP=needRegions,
                       needSpots=needSpots)

    if prot_families is not None:
        writeFastaProtFam(pangenome,
                          output,
                          compress,
                          prot_families,
                          show_bar=show_bar)
    if gene_families is not None:
        writeFastaGeneFam(pangenome,
                          output,
                          compress,
                          gene_families,
                          show_bar=show_bar)
    if genes is not None:
        writeGeneSequences(pangenome,
                           output,
                           compress,
                           genes,
                           show_bar=show_bar)
    if regions is not None:
        writeRegionsSequences(pangenome,
                              output,
                              compress,
                              regions,
                              fasta,
                              anno,
                              show_bar=show_bar)
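A hedged usage sketch: writing every gene sequence plus the protein sequences of the persistent gene families. The pangenome object is assumed to be loaded beforehand, the output directory is illustrative, and the filter values mirror the partition names checked in the code above ("persistent", "shell", "cloud"); "all" is assumed to select everything.

# assumes `pangenome` was loaded beforehand, e.g. as in the align example above
writeSequenceFiles(pangenome,
                   output="sequences_out",
                   genes="all",                 # assumed catch-all filter value
                   prot_families="persistent",  # one of the partition names checked above
                   compress=True,               # gzip the resulting files
                   show_bar=False)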
Code example #19
File: cluster.py Project: labgem/PPanGGOLiN
def readClustering(pangenome,
                   families_tsv_file,
                   infer_singletons=False,
                   force=False,
                   disable_bar=False):
    """
        Creates the pangenome, the gene families and the genes with an associated gene family.
        Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       disable_bar=disable_bar)

    logging.getLogger().info("Reading " + families_tsv_file +
                             " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_tsv_file = read_compressed_or_not(families_tsv_file)
    frag = False
    # the genome annotations are necessarily loaded.
    nbGeneWithFam = 0
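    # map locally-defined gene identifiers to gene objects, used below as a
    # fallback when looking a gene up by its pangenome-wide ID fails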
    localDict = mkLocal2Gene(pangenome)
    bar = tqdm(total=filesize, unit="bytes", disable=disable_bar)
    lineCounter = 0
    for line in families_tsv_file:
        lineCounter += 1
        bar.update(len(line))
        try:
            elements = [el.strip()
                        for el in line.split()]  # 2 or 3 fields expected
            if len(elements) <= 1:
                raise ValueError(
                    "No tabulation separator found in gene families file")
            (fam_id, gene_id,
             is_frag) = elements if len(elements) == 3 else elements + [None]
            try:
                geneObj = pangenome.getGene(gene_id)
            except KeyError:
                geneObj = localDict.get(gene_id)
            if geneObj is not None:
                nbGeneWithFam += 1
                fam = pangenome.addGeneFamily(fam_id)
                geneObj.is_fragment = (is_frag == "F")
                fam.addGene(geneObj)
            if is_frag == "F":
                frag = True
        except Exception as err:
            raise Exception(
                f"line {lineCounter} of the file '{families_tsv_file.name}' raised an error."
            ) from err
    bar.close()
    families_tsv_file.close()
    if nbGeneWithFam < len(
            pangenome.genes):  # not all genes have an associated cluster
        if nbGeneWithFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step."
                " Please ensure that the annotations you loaded previously and the clustering"
                " results you provided use the same gene IDs. If you use .gff files, it is the"
                " identifier stored in the 'ID' field. If you use .gbff files, it is the"
                " identifier stored in 'locus_tag'."
            )
        else:
            if infer_singletons:
                inferSingletons(pangenome)
            else:
                raise Exception(
                    f"Some genes ({len(pangenome.genes) - nbGeneWithFam}) did not have an associated "
                    f"cluster. Either change your cluster file so that each gene has a cluster, "
                    f"or use the --infer_singletons option to infer a cluster for each non-clustered gene."
                )
    pangenome.status["genesClustered"] = "Computed"
    if frag:  # if there was fragment information in the file.
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
Code example #20
File: writeFlat.py Project: zhaoc1/PPanGGOLiN
def writeFlatFiles(pangenome,
                   output,
                   cpu=1,
                   soft_core=0.95,
                   dup_margin=0.05,
                   csv=False,
                   genePA=False,
                   gexf=False,
                   light_gexf=False,
                   projection=False,
                   stats=False,
                   json=False,
                   partitions=False,
                   regions=False,
                   families_tsv=False,
                   all_genes=False,
                   all_prot_families=False,
                   all_gene_families=False,
                   spots=False,
                   borders=False,
                   compress=False):

    if not any(x for x in [
            csv, genePA, gexf, light_gexf, projection, stats, json, partitions,
            regions, spots, borders, families_tsv, all_genes,
            all_prot_families, all_gene_families
    ]):
        raise Exception("You did not indicate what file you wanted to write.")

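    # keep the pangenome in a module-level global so that the writer functions
    # dispatched to the Pool workers below can access it; the workers only
    # receive lightweight arguments such as the output path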
    global pan
    pan = pangenome
    processes = []
    needAnnotations = False
    needFamilies = False
    needGraph = False
    needPartitions = False
    needSpots = False
    needRegions = False

    if any([csv, genePA, gexf, light_gexf, projection, stats, json, partitions,
            regions, spots, families_tsv, borders]):
        needAnnotations = True
    if any([csv, genePA, gexf, light_gexf, projection, stats, json, partitions,
            regions, spots, families_tsv, all_prot_families, all_gene_families,
            borders]):
        needFamilies = True
    if projection or stats or partitions or regions or spots or borders:
        needPartitions = True
    if gexf or light_gexf or json:
        needGraph = True
    if regions or spots or borders:
        needRegions = True
    if spots or borders:
        needSpots = True

    # sequence-related flags have to be handled outside of checkPangenomeInfo,
    # since it does not check the sequence statuses
    ex_geneSequences = Exception(
        "The provided pangenome has no gene sequences. This is not compatible with any of the following options: --all_genes, --all_gene_families"
    )
    ex_geneFamilySequences = Exception(
        "The provided pangenome has no gene families. This is not compatible with any of the following options: --all_prot_families, --all_gene_families"
    )
    if pan.status["geneSequences"] not in ["inFile"] and (all_genes or
                                                          all_gene_families):
        raise ex_geneSequences
    if pan.status["geneFamilySequences"] not in [
            "Loaded", "Computed", "inFile"
    ] and all_prot_families:
        raise ex_geneFamilySequences

    checkPangenomeInfo(pan,
                       needAnnotations=needAnnotations,
                       needFamilies=needFamilies,
                       needGraph=needGraph,
                       needPartitions=needPartitions,
                       needRGP=needRegions,
                       needSpots=needSpots)
    pan.getIndex()  # build the index now, as most of the writers will need it
    with Pool(processes=cpu) as p:
        if csv:
            processes.append(
                p.apply_async(func=writeMatrix,
                              args=(',', "csv", output, compress, True)))
        if genePA:
            processes.append(
                p.apply_async(func=writeGenePresenceAbsence,
                              args=(output, compress)))
        if gexf:
            processes.append(
                p.apply_async(func=writeGEXF,
                              args=(output, False, soft_core, compress)))
        if light_gexf:
            processes.append(
                p.apply_async(func=writeGEXF,
                              args=(output, True, soft_core, compress)))
        if projection:
            processes.append(
                p.apply_async(func=writeProjections, args=(output, compress)))
        if stats:
            processes.append(
                p.apply_async(func=writeStats,
                              args=(output, soft_core, dup_margin, compress)))
        if json:
            processes.append(
                p.apply_async(func=writeJSON, args=(output, compress)))
        if partitions:
            processes.append(
                p.apply_async(func=writeParts,
                              args=(output, soft_core, compress)))
        if families_tsv:
            processes.append(
                p.apply_async(func=writeGeneFamiliesTSV,
                              args=(output, compress)))
        if all_genes:
            processes.append(
                p.apply_async(func=writeGeneSequences,
                              args=(output, compress)))
        if all_prot_families:
            processes.append(
                p.apply_async(func=writeFastaProtFam, args=(output, compress)))
        if all_gene_families:
            processes.append(
                p.apply_async(func=writeFastaGenFam, args=(output, compress)))
        if regions:
            processes.append(
                p.apply_async(func=writeRegions, args=(output, compress)))
        if spots:
            processes.append(
                p.apply_async(func=writeSpots, args=(output, compress)))
        if borders:
            processes.append(
                p.apply_async(func=writeBorders,
                              args=(output, dup_margin, compress)))

        for process in processes:
            process.get()  # block until each worker finishes and surface any exception it raised
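A hedged usage sketch of this entry point, assuming `pangenome` has already been loaded from its .h5 file as in the earlier examples, and with an illustrative output directory; it writes the gene presence/absence matrix, the lighter GEXF graph and the summary statistics using four worker processes:

writeFlatFiles(pangenome,
               output="flat_out",
               cpu=4,
               genePA=True,      # gene presence/absence matrix
               light_gexf=True,  # lighter version of the pangenome graph (GEXF)
               stats=True)       # per-genome summary statistics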