コード例 #1
0
def writeGeneSequences(output, compress=False):
    """Write gene nucleic sequences of the global pangenome to a FASTA file.

    The destination is ``<output>/all_genes.fna``; it is opened through
    ``write_compressed_or_not`` and filled by ``getGeneSequencesFromFile``.

    :param output: string prefix (expected to be a directory path) to which
        ``/all_genes.fna`` is appended
    :param compress: forwarded as-is to ``write_compressed_or_not``
        (presumably toggles compression of the output -- confirm in helper)
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleic sequences...")

    fasta_path = output + "/all_genes.fna"
    with write_compressed_or_not(fasta_path, compress) as handle:
        # `pan` is not a parameter: it is resolved from the enclosing module.
        getGeneSequencesFromFile(pan, handle)

    log.info(f"Done writing all the gene sequences : '{fasta_path}'")
コード例 #2
0
ファイル: writeSequences.py プロジェクト: tauqeer9/PPanGGOLiN
def writeFastaGeneFam(pangenome,
                      output,
                      compress,
                      gene_families,
                      show_bar=True):
    """Write representative nucleotide sequences of the selected gene families.

    The output file is ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: object exposing ``geneFamilies``, ``regions`` and
        ``file``; families expose ``name`` and ``namedPartition``
    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    :param gene_families: selection keyword. ``'all'`` selects every family,
        ``'persistent'``/``'shell'``/``'cloud'`` select the families whose
        ``namedPartition`` equals the keyword, ``'rgp'`` selects the union of
        ``region.families`` over ``pangenome.regions``. Any other value
        leaves the selection empty, so an empty name list is passed on.
    :param show_bar: forwarded to ``getGeneSequencesFromFile``
    """
    logger = logging.getLogger()
    outname = output + f"/{gene_families}_nucleotide_families.fasta"

    # The selection keywords are mutually exclusive, hence a single chain.
    genefams = set()
    if gene_families == 'all':
        logger.info(
            "Writing all of the representative nucleotide sequences of the gene families..."
        )
        genefams = pangenome.geneFamilies
    elif gene_families in ('persistent', 'shell', 'cloud'):
        logger.info(
            f"Writing the representative nucleotide sequences of the {gene_families} gene families..."
        )
        genefams = {
            fam
            for fam in pangenome.geneFamilies
            if fam.namedPartition == gene_families
        }
    elif gene_families == "rgp":
        logger.info(
            "Writing the representative nucleotide sequences of the gene families in RGPs..."
        )
        for region in pangenome.regions:
            genefams |= region.families

    with write_compressed_or_not(outname, compress) as fasta:
        # Sequences are looked up in the pangenome file by family name.
        getGeneSequencesFromFile(pangenome.file,
                                 fasta, [fam.name for fam in genefams],
                                 show_bar=show_bar)

    logger.info(
        f"Done writing the representative nucleotide sequences of the gene families : '{outname}'"
    )
コード例 #3
0
ファイル: writeSequences.py プロジェクト: tauqeer9/PPanGGOLiN
def writeGeneSequences(pangenome, output, compress, genes, show_bar=True):
    """Write the nucleotide sequences of the selected genes to a FASTA file.

    The output file is ``<output>/<genes>_genes.fna``.

    :param pangenome: object exposing ``genes``, ``regions``, ``file`` and a
        ``status`` mapping with a ``"geneSequences"`` entry
    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    :param genes: selection keyword. ``'all'`` selects ``pangenome.genes``,
        ``'persistent'``/``'shell'``/``'cloud'`` select the genes whose
        family ``namedPartition`` equals the keyword, ``'rgp'`` selects the
        genes of every region. Any other value selects nothing.
    :param show_bar: forwarded to the sequence-writing helper
    :raises Exception: if ``pangenome.status["geneSequences"]`` is none of
        ``"inFile"``, ``"Computed"`` or ``"Loaded"``
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleic sequences...")
    outname = output + f"/{genes}_genes.fna"

    # The selection keywords are mutually exclusive, hence a single chain.
    genes_to_write = []
    if genes == 'all':
        logger.info("Writing all of the gene sequences...")
        genes_to_write = pangenome.genes
    elif genes in ('persistent', 'shell', 'cloud'):
        logger.info(f"Writing all of the {genes} gene sequences...")
        genes_to_write = [
            gene for gene in pangenome.genes
            if gene.family.namedPartition == genes
        ]
    elif genes == "rgp":
        logger.info("Writing all of the gene sequences in RGP...")
        for region in pangenome.regions:
            genes_to_write.extend(region.genes)
    logger.info(f"There are {len(genes_to_write)} genes to write")

    with write_compressed_or_not(outname, compress) as fasta:
        seq_status = pangenome.status["geneSequences"]
        if seq_status == "inFile":
            # Sequences live in the pangenome file: look them up by gene ID.
            getGeneSequencesFromFile(pangenome.file,
                                     fasta,
                                     {gene.ID for gene in genes_to_write},
                                     show_bar=show_bar)
        elif seq_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome,
                                              fasta,
                                              genes_to_write,
                                              show_bar=show_bar)
        else:
            # This should never happen if the pangenome has been properly
            # checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    logger.info(f"Done writing the gene sequences : '{outname}'")
コード例 #4
0
ファイル: cluster.py プロジェクト: labgem/PPanGGOLiN
def checkPangenomeForClustering(pangenome, tmpFile, force, disable_bar=False):
    """Validate the pangenome state and dump its gene sequences to ``tmpFile``.

    ``checkPangenomeFormerClustering`` is run first. The sequences are then
    written from the in-memory annotations or from the pangenome file,
    depending on ``pangenome.status["geneSequences"]``.

    :param pangenome: object exposing ``status`` and ``file``
    :param tmpFile: open file-like object receiving the sequences; it is
        closed here only when no sequences are available
    :param force: forwarded to ``checkPangenomeFormerClustering``
    :param disable_bar: forwarded to the sequence-writing helper
    :raises Exception: if the pangenome holds no gene sequences
    """
    checkPangenomeFormerClustering(pangenome, force)

    seq_status = pangenome.status["geneSequences"]

    # Per the original author's note: gene IDs are tagged with 'ppanggolin_'
    # to avoid crashes from mmseqs when sequence IDs are only numeric.
    if seq_status in ("Computed", "Loaded"):
        writeGeneSequencesFromAnnotations(pangenome,
                                          tmpFile,
                                          add="ppanggolin_",
                                          disable_bar=disable_bar)
        return

    if seq_status == "inFile":
        # Write the CDS sequences stored in the pangenome file to tmpFile.
        getGeneSequencesFromFile(pangenome.file,
                                 tmpFile,
                                 add="ppanggolin_",
                                 disable_bar=disable_bar)
        return

    # No usable sequences: release the temporary file before failing.
    tmpFile.close()
    message = (
        "The pangenome does not include gene sequences, thus it is impossible to cluster "
        "the genes in gene families. Either provide clustering results (see --clusters), "
        "or provide a way to access the gene sequence during the annotation step "
        "(having the fasta in the gff files, or providing the fasta files through the --fasta option)"
    )
    raise Exception(message)
コード例 #5
0
ファイル: writeSequences.py プロジェクト: labgem/PPanGGOLiN
def writeFastaGeneFam(pangenome, output, compress, gene_families, soft_core=0.95, disable_bar=False):
    """Write representative nucleotide sequences of the selected gene families.

    The families are chosen by ``selectFamilies`` and written to
    ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: object exposing ``file``; also handed to
        ``selectFamilies``
    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    :param gene_families: selection keyword, interpreted by ``selectFamilies``
    :param soft_core: forwarded to ``selectFamilies``
    :param disable_bar: forwarded to ``getGeneSequencesFromFile``
    """
    fasta_path = output + f"/{gene_families}_nucleotide_families.fasta"

    selected = selectFamilies(
        pangenome,
        gene_families,
        "representative nucleotide sequences of the gene families",
        soft_core,
    )

    with write_compressed_or_not(fasta_path, compress) as handle:
        # Sequences are looked up in the pangenome file by family name.
        family_names = [family.name for family in selected]
        getGeneSequencesFromFile(pangenome.file, handle, family_names, disable_bar=disable_bar)

    logging.getLogger().info(
        f"Done writing the representative nucleotide sequences of the gene families : '{fasta_path}'"
    )
コード例 #6
0
def writeFastaGenFam(output, compress=False):
    """Write representative nucleic sequences of every gene family.

    Works on the module-level pangenome ``pan`` and writes to
    ``<output>/representative_gene_families.fna``.

    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info(
        "Writing the representative nucleic sequences of all the gene families..."
    )

    fasta_path = output + "/representative_gene_families.fna"
    with write_compressed_or_not(fasta_path, compress) as handle:
        # `pan` is not a parameter: it is resolved from the enclosing module.
        family_names = [family.name for family in pan.geneFamilies]
        getGeneSequencesFromFile(pan, handle, family_names)

    log.info(
        f"Done writing the representative nucleic sequences of all the gene families : '{fasta_path}'"
    )
コード例 #7
0
ファイル: writeFlat.py プロジェクト: zhaoc1/PPanGGOLiN
def writeGeneSequences(output, compress=False):
    """Write all gene nucleic sequences of the global pangenome to FASTA.

    The destination is ``<output>/all_genes.fna``. The sequences come from
    the pangenome file or from the in-memory annotations, depending on
    ``pan.status["geneSequences"]``.

    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    :raises Exception: if the status is none of ``"inFile"``, ``"Computed"``
        or ``"Loaded"``
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleic sequences...")
    fasta_path = output + "/all_genes.fna"

    with write_compressed_or_not(fasta_path, compress) as handle:
        # `pan` is not a parameter: it is resolved from the enclosing module.
        seq_status = pan.status["geneSequences"]
        if seq_status == "inFile":
            getGeneSequencesFromFile(pan, handle)
        elif seq_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pan, handle)
        else:
            # This should never happen if the pangenome has been properly
            # checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")

    log.info(f"Done writing all the gene sequences : '{fasta_path}'")
コード例 #8
0
def checkPangenomeForClustering(pangenome, tmpFile, force):
    """Validate the pangenome state and dump its gene sequences to ``tmpFile``.

    ``checkPangenomeFormerClustering`` is run first. The sequences are then
    written from the in-memory annotations or from the stored pangenome,
    depending on ``pangenome.status["geneSequences"]``.

    :param pangenome: object exposing a ``status`` mapping
    :param tmpFile: open file-like object receiving the sequences; it is
        closed here only when no sequences are available
    :param force: forwarded to ``checkPangenomeFormerClustering``
    :raises Exception: if the pangenome holds no gene sequences
    """
    checkPangenomeFormerClustering(pangenome, force)

    seq_status = pangenome.status["geneSequences"]

    if seq_status in ("Computed", "Loaded"):
        writeGeneSequencesFromAnnotations(pangenome, tmpFile)
        return

    if seq_status == "inFile":
        # Write the CDS sequences to the tmpFile.
        getGeneSequencesFromFile(pangenome, tmpFile)
        return

    # No usable sequences: release the temporary file before failing.
    tmpFile.close()
    message = (
        "The pangenome does not include gene sequences, thus it is impossible to cluster "
        "the genes in gene families. Either provide clustering results (see --clusters), "
        "or provide a way to access the gene sequence during the annotation step "
        "(having the fasta in the gff files, or providing the fasta files through the --fasta option)"
    )
    raise Exception(message)
コード例 #9
0
ファイル: writeSequences.py プロジェクト: labgem/PPanGGOLiN
def writeGeneSequences(pangenome, output, compress, genes, soft_core=0.95, disable_bar=False):
    """Write the nucleotide sequences of the selected genes to a FASTA file.

    Families are chosen by ``selectFamilies``; every gene of every selected
    family is written to ``<output>/<genes>_genes.fna``.

    :param pangenome: object exposing ``file`` and a ``status`` mapping with
        a ``"geneSequences"`` entry; also handed to ``selectFamilies``
    :param output: string prefix (expected to be a directory path) of the
        output file
    :param compress: forwarded as-is to ``write_compressed_or_not``
    :param genes: selection keyword, interpreted by ``selectFamilies``
    :param soft_core: forwarded to ``selectFamilies``
    :param disable_bar: forwarded to the sequence-writing helper
    :raises Exception: if ``pangenome.status["geneSequences"]`` is none of
        ``"inFile"``, ``"Computed"`` or ``"Loaded"``
    """
    logger = logging.getLogger()
    logger.info("Writing all the gene nucleotide sequences...")
    outname = output + f"/{genes}_genes.fna"

    genefams = selectFamilies(pangenome, genes, "gene nucleotide sequences", soft_core)
    # Flatten the selected families into the list of their genes.
    genes_to_write = [gene for fam in genefams for gene in fam.genes]

    logger.info(f"There are {len(genes_to_write)} genes to write")
    with write_compressed_or_not(outname, compress) as fasta:
        seq_status = pangenome.status["geneSequences"]
        if seq_status == "inFile":
            # Sequences live in the pangenome file: look them up by gene ID.
            getGeneSequencesFromFile(pangenome.file, fasta, {gene.ID for gene in genes_to_write},
                                     disable_bar=disable_bar)
        elif seq_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome, fasta, genes_to_write, disable_bar=disable_bar)
        else:
            # This should never happen if the pangenome has been properly
            # checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    logger.info(f"Done writing the gene sequences : '{outname}'")