Example #1
0
def writeOrgFile(org, output, compress=False):
    """Write a per-gene TSV table for one organism to ``<output>/<org.name>.tsv``.

    One row is written for every gene of every contig of ``org``. Each row
    also reports how many neighbors of the gene's family belong to each
    named partition.

    :param org: organism whose contigs and genes are listed
    :param output: directory path the file name is appended to
    :param compress: forwarded to ``write_compressed_or_not``
    """
    columns = ["gene", "contig", "start", "stop", "strand", "ori", "family", "nb_copy_in_org", "partition",
               "persistent_neighbors", "shell_neighbors", "cloud_neighbors"]
    with write_compressed_or_not(output + "/" + org.name + ".tsv", compress) as outfile:
        outfile.write("\t".join(columns) + "\n")
        for contig in org.contigs:
            for gene in contig.genes:
                # any neighbor that is neither persistent nor shell is counted as cloud
                neighbor_counts = {"persistent": 0, "shell": 0, "cloud": 0}
                for neighbor in gene.family.neighbors:
                    partition = neighbor.namedPartition
                    bucket = partition if partition in ("persistent", "shell") else "cloud"
                    neighbor_counts[bucket] += 1

                # the local identifier takes precedence over the internal ID when it is set
                if gene.local_identifier == "":
                    gene_label = gene.ID
                else:
                    gene_label = gene.local_identifier
                # "ori" column: flag genes whose name or product is dnaA (case-insensitive)
                is_dnaa = gene.name.upper() == "DNAA" or gene.product.upper() == "DNAA"
                row = [gene_label,
                       contig.name,
                       gene.start,
                       gene.stop,
                       gene.strand,
                       "T" if is_dnaa else "F",
                       gene.family.name,
                       len(gene.family.getGenesPerOrg(org)),
                       gene.family.namedPartition,
                       neighbor_counts["persistent"],
                       neighbor_counts["shell"],
                       neighbor_counts["cloud"]]
                outfile.write("\t".join(str(value) for value in row) + "\n")
Example #2
0
def writeSpotModules(output, compress):
    """Write ``<output>/modules_spots.tsv`` associating modules with spots.

    A module/spot pair is written only when every family of the module is
    found among the families of the RGPs returned by ``spot.getUniqContent()``.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing modules to spot associations...")

    # family -> module lookup; a family listed in several modules keeps the last one visited
    family_to_module = {family: module for module in pan.modules for family in module.families}

    out_path = output + "/modules_spots.tsv"
    with write_compressed_or_not(out_path, compress) as fout:
        fout.write("module_id\tspot_id\n")

        for spot in pan.spots:
            # module -> families of that module seen in this spot
            families_seen = defaultdict(set)
            for rgp in spot.getUniqContent():
                for family in rgp.families:
                    module = family_to_module.get(family)
                    if module is None:
                        continue
                    families_seen[module].add(family)

            for module, families in families_seen.items():
                # write the association only if all the families of the module are in the spot
                if families == module.families:
                    fout.write(f"module_{module.ID}\tspot_{spot.ID}\n")

    log.info(f"Done writing module to spot associations to: {out_path}")
Example #3
0
def writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, disable_bar=False):
    """Write RGP nucleotide sequences to ``<output>/<regions>_rgp_genomic_sequences.fasta``.

    :param pangenome: object whose ``regions`` are written
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param regions: ``"complete"`` keeps only regions that are not on a contig border;
        any other value keeps every region. Also used in the output file name.
    :param fasta: tab-separated organisms file, used when it is not None
    :param anno: tab-separated organisms file, used when ``fasta`` is None
    :param disable_bar: passed to ``tqdm`` as ``disable``
    :raises SystemExit: with status 1 when a line of the organisms file has no tab separator
    """
    organisms_file = fasta if fasta is not None else anno
    # first column -> second column of the organisms file, handed to read_genome_file
    org_dict = {}
    for line in read_compressed_or_not(organisms_file):
        elements = [el.strip() for el in line.split("\t")]
        if len(elements) <= 1:
            logging.getLogger().error(
                f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'")
            # raise directly: the `exit` builtin is injected by `site` and is not always available
            raise SystemExit(1)
        org_dict[elements[0]] = elements[1]

    logging.getLogger().info(f"Writing {regions} rgp genomic sequences...")
    if regions == "complete":
        regions_to_write = [region for region in pangenome.regions if not region.isContigBorder]
    else:
        regions_to_write = pangenome.regions

    # order regions by organism, so that only one genome has to be read at a time
    regions_to_write = sorted(regions_to_write, key=lambda x: x.organism.name)

    outname = output + f"/{regions}_rgp_genomic_sequences.fasta"
    # the handle gets its own name so that it does not shadow the `fasta` parameter
    with write_compressed_or_not(outname, compress) as fasta_out:
        # sentinel: forces a genome load for the first region, even if its organism name is ""
        loaded_genome = None
        genome_sequence = None
        # the context manager closes the progress bar even if a write fails
        with tqdm(regions_to_write, unit="rgp", disable=disable_bar) as bar:
            for region in bar:
                if loaded_genome is None or region.organism.name != loaded_genome:
                    loaded_genome = region.organism.name
                    genome_sequence = read_genome_file(org_dict, loaded_genome)
                fasta_out.write(f">{region.name}\n")
                fasta_out.write(
                    write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60))
    logging.getLogger().info(f"Done writing the regions nucleotide sequences: '{outname}'")
Example #4
0
def writeRegions(output, compress=False):
    """Write one line per region of ``pan`` to ``<output>/plastic_regions.tsv``.

    Regions are sorted by organism name, then contig name, then start position.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """

    def region_order(region):
        """Sort key: organism name, contig name, start."""
        return region.organism.name, region.contig.name, region.start

    fname = output + "/plastic_regions.tsv"
    with write_compressed_or_not(fname, compress) as tab:
        tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n")
        for region in sorted(pan.regions, key=region_order):
            # organism and contig are written through str(), not through their .name
            fields = (region.name, region.organism, region.contig, region.start, region.stop,
                      len(region.genes), region.isContigBorder, region.isWholeContig)
            tab.write('\t'.join(str(field) for field in fields) + "\n")
Example #5
0
def summarize_spots(spots, output, compress):
    """Write per-spot summary statistics to ``<output>/summarize_spots.tsv``.

    Spots are written from the one with the most regions to the one with the
    fewest. For each spot the row holds the number of RGPs, the number of
    distinct families, the size of ``spot.getUniqContent()`` and the mean,
    standard deviation, maximum and minimum number of genes per RGP.

    :param spots: iterable of spots to summarize
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """

    def as_text(value):
        """Return ``value`` as a string; floats are rounded to 3 decimal places first."""
        return str(round(value, 3)) if isinstance(value, float) else str(value)

    header = "spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\tstdev_nb_genes\tmax_nb_genes\tmin_nb_genes\n"
    with write_compressed_or_not(output + "/summarize_spots.tsv", compress) as fout:
        fout.write(header)
        spots_by_size = sorted(spots, key=lambda x: len(x.regions), reverse=True)
        for spot in spots_by_size:
            nb_rgp = len(list(spot.regions))
            nb_uniq_content = len(spot.getUniqContent())
            all_families = set()
            gene_counts = []
            for rgp in spot.regions:
                all_families |= rgp.families
                gene_counts.append(len(rgp.genes))
            mean_size = mean(gene_counts)
            # stdev needs at least two data points; a single RGP is reported as 0
            stdev_size = stdev(gene_counts) if len(gene_counts) > 1 else 0
            max_size = max(gene_counts)
            min_size = min(gene_counts)
            row = [f"spot_{spot.ID}", nb_rgp, len(all_families), nb_uniq_content,
                   mean_size, stdev_size, max_size, min_size]
            fout.write("\t".join(as_text(value) for value in row) + "\n")
    logging.getLogger().info(f"Done writing spots in : '{output + '/summarize_spots.tsv'}'")
Example #6
0
def writeFastaGeneFam(pangenome,
                      output,
                      compress,
                      gene_families,
                      show_bar=True):
    """Write representative nucleotide sequences of selected gene families.

    The output file is ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: object providing ``geneFamilies``, ``regions`` and ``file``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param gene_families: ``'all'``, a partition name (``'persistent'``, ``'shell'``,
        ``'cloud'``) or ``'rgp'``; any other value selects no family
    :param show_bar: forwarded to ``getGeneSequencesFromFile``
    """
    log = logging.getLogger()
    outname = output + f"/{gene_families}_nucleotide_families.fasta"

    if gene_families == 'all':
        log.info("Writing all of the representative nucleotide sequences of the gene families...")
        genefams = pangenome.geneFamilies
    elif gene_families in ['persistent', 'shell', 'cloud']:
        log.info(f"Writing the representative nucleotide sequences of the {gene_families} gene families...")
        genefams = {fam for fam in pangenome.geneFamilies if fam.namedPartition == gene_families}
    elif gene_families == "rgp":
        log.info("Writing the representative nucleotide sequences of the gene families in RGPs...")
        genefams = set()
        for region in pangenome.regions:
            genefams |= region.families
    else:
        # unrecognized selector: nothing is selected
        genefams = set()

    with write_compressed_or_not(outname, compress) as fasta:
        family_names = [fam.name for fam in genefams]
        getGeneSequencesFromFile(pangenome.file, fasta, family_names, show_bar=show_bar)

    log.info(f"Done writing the representative nucleotide sequences of the gene families : '{outname}'")
Example #7
0
def writeGeneSequences(pangenome, output, compress, genes, show_bar=True):
    """Write the nucleotide sequences of selected genes to ``<output>/<genes>_genes.fna``.

    :param pangenome: object providing ``genes``, ``regions``, ``status`` and ``file``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param genes: ``'all'``, a partition name (``'persistent'``, ``'shell'``,
        ``'cloud'``) or ``'rgp'``; any other value selects no gene
    :param show_bar: forwarded to the sequence-writing helper
    :raises Exception: if ``pangenome.status["geneSequences"]`` is none of
        ``"inFile"``, ``"Computed"`` or ``"Loaded"``
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleic sequences...")
    outname = output + f"/{genes}_genes.fna"

    if genes == 'all':
        log.info("Writing all of the gene sequences...")
        genes_to_write = pangenome.genes
    elif genes in ['persistent', 'shell', 'cloud']:
        log.info(f"Writing all of the {genes} gene sequences...")
        genes_to_write = [gene for gene in pangenome.genes if gene.family.namedPartition == genes]
    elif genes == "rgp":
        log.info("Writing all of the gene sequences in RGP...")
        genes_to_write = []
        for region in pangenome.regions:
            genes_to_write.extend(region.genes)
    else:
        # unrecognized selector: nothing is selected
        genes_to_write = []

    log.info(f"There are {len(genes_to_write)} genes to write")
    with write_compressed_or_not(outname, compress) as fasta:
        sequence_status = pangenome.status["geneSequences"]
        if sequence_status == "inFile":
            gene_ids = {gene.ID for gene in genes_to_write}
            getGeneSequencesFromFile(pangenome.file, fasta, gene_ids, show_bar=show_bar)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome, fasta, genes_to_write, show_bar=show_bar)
        else:
            # this should never happen if the pangenome has been properly checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    log.info(f"Done writing the gene sequences : '{outname}'")
Example #8
0
def writeGeneSequences(output, compress=False):
    """Write the gene sequences of ``pan`` to ``<output>/all_genes.fna``.

    The writing itself is delegated to ``getGeneSequencesFromFile``.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleic sequences...")
    outname = "/".join((output, "all_genes.fna"))
    with write_compressed_or_not(outname, compress) as fasta:
        getGeneSequencesFromFile(pan, fasta)
    log.info(f"Done writing all the gene sequences : '{outname}'")
Example #9
0
def writeGeneFamiliesTSV(output, compress=False):
    """Write the gene to gene-family association to ``<output>/gene_families.tsv``.

    Each line holds the family name, the gene identifier and ``"F"`` when the
    gene is flagged as a fragment (an empty field otherwise). No header line
    is written.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing the file providing the association between genes and gene families...")
    outname = output + "/gene_families.tsv"
    with write_compressed_or_not(outname, compress) as tsv:
        for fam in pan.geneFamilies:
            for gene in fam.genes:
                # the local identifier takes precedence over the internal ID when it is set
                if gene.local_identifier == "":
                    gene_label = gene.ID
                else:
                    gene_label = gene.local_identifier
                fragment_flag = "F" if gene.is_fragment else ""
                fields = [fam.name, gene_label, fragment_flag]
                tsv.write("\t".join(fields) + "\n")
    log.info(f"Done writing the file providing the association between genes and gene families : '{outname}'")
Example #10
0
def spot2rgp(spots, output, compress):
    """Write the spot to RGP association to ``<output>/spots.tsv``.

    One line is written per (spot, region) pair, after a header line.

    :param spots: iterable of spots, each exposing ``ID`` and ``regions``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    with write_compressed_or_not(output + "/spots.tsv", compress) as fout:
        fout.write("spot_id\trgp_id\n")
        for spot in spots:
            for rgp in spot.regions:
                fout.write(f"spot_{spot.ID}\t{rgp.name}\n")
Example #11
0
def writeJSON(output, compress):
    """Write the pangenome graph to ``<output>/pangenomeGraph.json``.

    The header, nodes and edges are produced by the ``writeJSON*`` helpers;
    this function then writes the closing brace.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing the json file for the pangenome graph...")
    outname = output + "/pangenomeGraph.json"
    with write_compressed_or_not(outname, compress) as json_file:
        for write_section in (writeJSONheader, writeJSONnodes, writeJSONedges):
            write_section(json_file)
        json_file.write("}")
    log.info(f"Done writing the json file : '{outname}'")
Example #12
0
def writeBorders(output, dup_margin, compress):
    """Write spot borders and the sequences of the bordering families.

    Two files are produced: ``<output>/spot_borders.tsv`` lists, for each spot
    (largest number of regions first), the family names of both sides of each
    border; ``<output>/border_protein_genes.fasta`` holds the ``sequence`` of
    every family seen in a border.

    :param output: output directory path
    :param dup_margin: passed to ``pan.get_multigenics``
    :param compress: forwarded to ``write_compressed_or_not``
    """
    multigenics = pan.get_multigenics(dup_margin=dup_margin)
    border_families = set()
    with write_compressed_or_not(output + "/spot_borders.tsv", compress) as fout:
        fout.write("spot_id\tnumber\tborder1\tborder2\n")
        spots_by_size = sorted(pan.spots, key=lambda x: len(x.regions), reverse=True)
        for spot in spots_by_size:
            for count, border in spot.borders(pan.parameters["spots"]["set_size"], multigenics):
                first_side, second_side = border[0], border[1]
                first_names = ",".join(fam.name for fam in first_side)
                second_names = ",".join(fam.name for fam in second_side)
                border_families |= set(first_side)
                border_families |= set(second_side)
                fout.write(f"{spot.ID}\t{count}\t{first_names}\t{second_names}\n")

    with write_compressed_or_not(output + "/border_protein_genes.fasta", compress) as fout:
        for fam in border_families:
            fout.write(f">{fam.name}\n")
            fout.write(f"{fam.sequence}\n")
Example #13
0
def writeModules(output, compress):
    """Write the module to family association to ``<output>/functional_modules.tsv``.

    One line is written per (module, family) pair, after a header line.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing functional modules...")
    outname = output + "/functional_modules.tsv"
    # the with-statement closes the file; no explicit close() is needed
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\tfamily_id\n")
        for mod in pan.modules:
            for family in mod.families:
                fout.write(f"module_{mod.ID}\t{family.name}\n")

    logging.getLogger().info(f"Done writing functional modules to: '{outname}'")
Example #14
0
def writeFastaGeneFam(pangenome, output, compress, gene_families, soft_core=0.95, disable_bar=False):
    """Write representative nucleotide sequences of selected gene families.

    The families are chosen by ``selectFamilies`` and written to
    ``<output>/<gene_families>_nucleotide_families.fasta``.

    :param pangenome: object passed to ``selectFamilies``; its ``file`` is read for the sequences
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param gene_families: selector passed to ``selectFamilies``; also used in the file name
    :param soft_core: passed to ``selectFamilies``
    :param disable_bar: forwarded to ``getGeneSequencesFromFile``
    """
    outname = output + f"/{gene_families}_nucleotide_families.fasta"

    description = "representative nucleotide sequences of the gene families"
    selected_families = selectFamilies(pangenome, gene_families, description, soft_core)

    with write_compressed_or_not(outname, compress) as fasta:
        family_names = [fam.name for fam in selected_families]
        getGeneSequencesFromFile(pangenome.file, fasta, family_names, disable_bar=disable_bar)

    logging.getLogger().info(f"Done writing the representative nucleotide sequences of the gene families : '{outname}'")
Example #15
0
def writeFastaGenFam(output, compress=False):
    """Write sequences for every gene family of ``pan`` to ``<output>/representative_gene_families.fna``.

    The names of all families are handed to ``getGeneSequencesFromFile``,
    which does the writing.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing the representative nucleic sequences of all the gene families...")
    outname = output + "/representative_gene_families.fna"
    with write_compressed_or_not(outname, compress) as fasta:
        family_names = [fam.name for fam in pan.geneFamilies]
        getGeneSequencesFromFile(pan, fasta, family_names)
    log.info(f"Done writing the representative nucleic sequences of all the gene families : '{outname}'")
Example #16
0
def writeFastaProtFam(pangenome, output, compress, prot_families, soft_core=0.95, disable_bar=False):
    """Write the ``sequence`` of selected gene families to a FASTA file.

    The families are chosen by ``selectFamilies`` and written to
    ``<output>/<prot_families>_protein_families.faa``, one record per family.

    :param pangenome: object passed to ``selectFamilies``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param prot_families: selector passed to ``selectFamilies``; also used in the file name
    :param soft_core: passed to ``selectFamilies``
    :param disable_bar: passed to ``tqdm`` as ``disable``
    """
    outname = output + f"/{prot_families}_protein_families.faa"

    description = "representative amino acid sequences of the gene families"
    selected_families = selectFamilies(pangenome, prot_families, description, soft_core)

    with write_compressed_or_not(outname, compress) as fasta:
        progress = tqdm(selected_families, unit="prot families", disable=disable_bar)
        for family in progress:
            header_line = '>' + family.name + "\n"
            fasta.write(header_line)
            sequence_line = family.sequence + "\n"
            fasta.write(sequence_line)
        progress.close()
    logging.getLogger().info(f"Done writing the representative amino acid sequences of the gene families : '{outname}'")
Example #17
0
def writeGeneSequences(output, compress=False):
    """Write the gene sequences of ``pan`` to ``<output>/all_genes.fna``.

    The helper used depends on ``pan.status["geneSequences"]``:
    ``getGeneSequencesFromFile`` for ``"inFile"``, and
    ``writeGeneSequencesFromAnnotations`` for ``"Computed"`` or ``"Loaded"``.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :raises Exception: for any other status value
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleic sequences...")
    outname = output + "/all_genes.fna"

    with write_compressed_or_not(outname, compress) as fasta:
        sequence_status = pan.status["geneSequences"]
        if sequence_status == "inFile":
            getGeneSequencesFromFile(pan, fasta)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pan, fasta)
        else:
            # this should never happen if the pangenome has been properly checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    log.info(f"Done writing all the gene sequences : '{outname}'")
Example #18
0
def writeOrgModules(output, compress):
    """Write the module to organism association to ``<output>/modules_in_organisms.tsv``.

    For each module, one line is written per organism that holds at least one
    family of the module. ``completion`` is the fraction of the module's
    families present in that organism, rounded to 2 decimal places.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing modules to organisms associations...")
    outname = output + "/modules_in_organisms.tsv"
    # the with-statement closes the file; no explicit close() is needed
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\torganism\tcompletion\n")
        for mod in pan.modules:
            # every organism holding at least one family of the module
            mod_orgs = set()
            for fam in mod.families:
                mod_orgs |= fam.organisms
            for org in mod_orgs:
                completion = round(len(org.families & mod.families) / len(mod.families), 2)
                fout.write(f"module_{mod.ID}\t{org.name}\t{completion}\n")
    logging.getLogger().info(
        f"Done writing modules to organisms associations to: '{outname}'")
Example #19
0
def writeGEXF(output, light = True, soft_core = 0.95, compress=False):
    """Write the pangenome graph as a gexf file.

    The file is ``<output>/pangenomeGraph_light.gexf`` when ``light`` is true
    and ``<output>/pangenomeGraph.gexf`` otherwise. The content is produced by
    the ``writeGEXF*`` helpers.

    :param output: output directory path
    :param light: selects the file name and is forwarded to the header, node and edge helpers
    :param soft_core: not used in this function body
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    if light:
        log.info("Writing the light gexf file for the pangenome graph...")
        suffix = "_light"
    else:
        log.info("Writing the gexf file for the pangenome graph...")
        suffix = ""
    outname = output + "/pangenomeGraph" + suffix + ".gexf"
    with write_compressed_or_not(outname, compress) as gexf:
        for write_section in (writeGEXFheader, writeGEXFnodes, writeGEXFedges):
            write_section(gexf, light)
        writeGEXFend(gexf)
    log.info(f"Done writing the gexf file : '{outname}'")
Example #20
0
def writeFastaProtFam(output, compress=False):
    """Write the ``sequence`` of every gene family of ``pan`` to a FASTA file.

    The file is ``<output>/representative_gene_families.faa``, with one record
    per family. A progress bar is advanced once per family.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    log = logging.getLogger()
    log.info("Writing the representative proteic sequences of all the gene families...")
    outname = output + "/representative_gene_families.faa"
    with write_compressed_or_not(outname, compress) as fasta:
        progress = tqdm(range(pan.number_of_geneFamilies()), unit="prot families")
        families = list(pan.geneFamilies)
        for family in families:
            header_line = '>' + family.name + "\n"
            fasta.write(header_line)
            sequence_line = family.sequence + "\n"
            fasta.write(sequence_line)
            progress.update()
        progress.close()
    log.info(f"Done writing the representative proteic sequences of all the gene families : '{outname}'")
Example #21
0
def writeModuleSummary(output, compress):
    """Write a one-line summary per module to ``<output>/modules_summary.tsv``.

    Columns: module id, number of families, number of organisms holding at
    least one gene of the module, most frequent named partition among the
    module's families, and the mean number of genes per organism divided by
    the number of families (rounded to 3 decimal places).

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing functional modules summary...")
    outname = output + "/modules_summary.tsv"
    # the with-statement closes the file; no explicit close() is needed
    with write_compressed_or_not(outname, compress) as fout:
        fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n")
        for mod in pan.modules:
            org_dict = defaultdict(set)  # organism -> genes of this module found in it
            partition_counter = Counter()
            for family in mod.families:
                partition_counter[family.namedPartition] += 1
                for gene in family.genes:
                    org_dict[gene.organism].add(gene)

            nb_families = len(mod.families)
            nb_organisms = len(org_dict)
            main_partition = partition_counter.most_common(1)[0][0]
            total_genes = sum(len(genes) for genes in org_dict.values())
            mean_occurrence = round((total_genes / nb_organisms) / nb_families, 3)
            fout.write(
                f"module_{mod.ID}\t{nb_families}\t{nb_organisms}\t{main_partition}\t"
                f"{mean_occurrence}\n")

    logging.getLogger().info(f"Done writing module summary: '{outname}'")
Example #22
0
def writeOrgFile(org, output, compress=False):
    """Write a per-gene TSV table for one organism to ``<output>/<org.name>.tsv``.

    One row is written for every gene of every contig of ``org``. The
    ``RGPs``, ``Spots`` and ``Modules`` columns are added only when the
    ``needRegions``, ``needSpots`` and ``needModules`` flags are set. A gene
    with nothing to report in one of those columns gets ``None`` there, which
    is written as the text ``None``.

    :param org: organism whose contigs and genes are listed
    :param output: directory path the file name is appended to
    :param compress: forwarded to ``write_compressed_or_not``
    """
    with write_compressed_or_not(output + "/" + org.name + ".tsv", compress) as outfile:
        columns = ["gene", "contig", "start", "stop", "strand", "family", "nb_copy_in_org",
                   "partition", "persistent_neighbors", "shell_neighbors", "cloud_neighbors"]
        optional_columns = ((needRegions, "RGPs"), (needSpots, "Spots"), (needModules, "Modules"))
        columns.extend(title for wanted, title in optional_columns if wanted)
        outfile.write("\t".join(columns) + "\n")

        for contig in org.contigs:
            for gene in contig.genes:
                # any neighbor that is neither persistent nor shell is counted as cloud
                neighbor_counts = {"persistent": 0, "shell": 0, "cloud": 0}
                for neighbor in gene.family.neighbors:
                    partition = neighbor.namedPartition
                    bucket = partition if partition in ("persistent", "shell") else "cloud"
                    neighbor_counts[bucket] += 1

                # the local identifier takes precedence over the internal ID when it is set
                if gene.local_identifier == "":
                    gene_label = gene.ID
                else:
                    gene_label = gene.local_identifier
                fields = [gene_label, contig.name, gene.start, gene.stop, gene.strand, gene.family.name,
                          len(gene.family.getGenesPerOrg(org)), gene.family.namedPartition,
                          neighbor_counts["persistent"], neighbor_counts["shell"], neighbor_counts["cloud"]]

                if needRegions:
                    rgp_names = None
                    if len(gene.RGP) > 0:
                        rgp_names = ','.join(str(region.name) for region in gene.RGP)
                    fields.append(rgp_names)
                if needSpots:
                    spot_ids = None
                    if len(gene.family.spot) > 0:
                        spot_ids = ','.join(str(s.ID) for s in gene.family.spot)
                    fields.append(spot_ids)
                if needModules:
                    module_ids = None
                    if len(gene.family.modules) > 0:
                        module_ids = ','.join("module_" + str(module.ID) for module in gene.family.modules)
                    fields.append(module_ids)

                outfile.write("\t".join(str(field) for field in fields) + "\n")
Example #23
0
def writeGeneSequences(pangenome, output, compress, genes, soft_core=0.95, disable_bar=False):
    """Write the nucleotide sequences of the genes of selected families.

    The families are chosen by ``selectFamilies``; all their genes are
    written to ``<output>/<genes>_genes.fna``.

    :param pangenome: object passed to ``selectFamilies``; provides ``status`` and ``file``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param genes: selector passed to ``selectFamilies``; also used in the file name
    :param soft_core: passed to ``selectFamilies``
    :param disable_bar: forwarded to the sequence-writing helper
    :raises Exception: if ``pangenome.status["geneSequences"]`` is none of
        ``"inFile"``, ``"Computed"`` or ``"Loaded"``
    """
    log = logging.getLogger()
    log.info("Writing all the gene nucleotide sequences...")
    outname = output + f"/{genes}_genes.fna"

    selected_families = selectFamilies(pangenome, genes, "gene nucleotide sequences", soft_core)
    genes_to_write = [gene for fam in selected_families for gene in fam.genes]

    log.info(f"There are {len(genes_to_write)} genes to write")
    with write_compressed_or_not(outname, compress) as fasta:
        sequence_status = pangenome.status["geneSequences"]
        if sequence_status == "inFile":
            gene_ids = {gene.ID for gene in genes_to_write}
            getGeneSequencesFromFile(pangenome.file, fasta, gene_ids, disable_bar=disable_bar)
        elif sequence_status in ("Computed", "Loaded"):
            writeGeneSequencesFromAnnotations(pangenome, fasta, genes_to_write, disable_bar=disable_bar)
        else:
            # this should never happen if the pangenome has been properly checked before launching this function.
            raise Exception("The pangenome does not include gene sequences")
    log.info(f"Done writing the gene sequences : '{outname}'")
Example #24
0
def writeGenePresenceAbsence(output, compress=False):
    """Write a 0/1 presence-absence matrix to ``<output>/gene_presence_absence.Rtab``.

    The header is ``Gene`` followed by ``str()`` of each organism of ``pan``.
    Each following row holds a family name and, per organism column, ``"1"``
    when the family is present in that organism and ``"0"`` otherwise.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    logging.getLogger().info("Writing the gene presence absence file ...")
    outname = output + "/gene_presence_absence.Rtab"
    with write_compressed_or_not(outname, compress) as matrix:
        matrix.write('\t'.join(['Gene'] + [str(org) for org in pan.organisms]) + "\n")

        absent_everywhere = ["0"] * len(pan.organisms)
        # presumably maps each organism to its column, in the order of pan.organisms - verify
        org_index = pan.getIndex()
        for fam in pan.geneFamilies:
            presence = absent_everywhere.copy()
            for org in fam.organisms:
                presence[org_index[org]] = "1"

            matrix.write('\t'.join([fam.name] + presence) + "\n")
    logging.getLogger().info(f"Done writing the gene presence absence file : '{outname}'")
Example #25
0
def writeRGPModules(output, compress):
    """Group RGPs by module content and write them to ``<output>/modules_RGP_lists.tsv``.

    Regions of ``pan`` that hold families of exactly the same set of modules
    are grouped together; regions with no module family are skipped. One line
    is written per group: the name of its first region, the number of
    distinct spots its regions belong to, the module ids and the region names.

    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    """
    logging.getLogger().info("Clustering RGPs based on module content...")

    # a family listed in several modules keeps the last module visited
    fam2mod = {}
    for mod in pan.modules:
        for fam in mod.families:
            fam2mod[fam] = mod

    region2spot = {}
    for spot in pan.spots:
        for region in spot.regions:
            region2spot[region] = spot

    # set of modules -> regions whose families cover exactly those modules
    mod_group2rgps = defaultdict(list)
    for region in pan.regions:
        curr_mod_list = set()
        for fam in region.families:
            mod = fam2mod.get(fam)
            if mod is not None:
                curr_mod_list.add(mod)
        if curr_mod_list:
            mod_group2rgps[frozenset(curr_mod_list)].append(region)

    outname = output + "/modules_RGP_lists.tsv"
    # the context manager closes the file even if writing a row raises
    with write_compressed_or_not(outname, compress) as lists:
        lists.write("representative_RGP\tnb_spots\tmod_list\tRGP_list\n")
        for mod_list, regions in mod_group2rgps.items():
            spot_list = set()
            for region in regions:
                myspot = region2spot.get(region)
                if myspot is not None:
                    spot_list.add(myspot)
            module_ids = ','.join(['module_' + str(mod.ID) for mod in mod_list])
            region_names = ','.join([reg.name for reg in regions])
            lists.write(f"{regions[0].name}\t{len(spot_list)}\t{module_ids}\t{region_names}\n")

    logging.getLogger().info(f"RGP and associated modules are listed in : {outname}")
Example #26
0
def writeFastaProtFam(pangenome,
                      output,
                      compress,
                      prot_families,
                      show_bar=False):
    """Write the ``sequence`` of selected gene families to a FASTA file.

    The output file is ``<output>/<prot_families>_protein_families.faa``,
    with one record per selected family.

    :param pangenome: object providing ``geneFamilies`` and ``regions``
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param prot_families: ``'all'``, a partition name (``'persistent'``, ``'shell'``,
        ``'cloud'``) or ``'rgp'``; any other value selects no family
    :param show_bar: the progress bar is disabled unless this is true
    """
    log = logging.getLogger()
    outname = output + f"/{prot_families}_protein_families.faa"

    if prot_families == 'all':
        log.info("Writing all of the representative amino acid sequences of the gene families...")
        genefams = pangenome.geneFamilies
    elif prot_families in ['persistent', 'shell', 'cloud']:
        log.info(f"Writing the representative amino acid sequences of the {prot_families} gene families...")
        genefams = {fam for fam in pangenome.geneFamilies if fam.namedPartition == prot_families}
    elif prot_families == "rgp":
        log.info("Writing the representative amino acid sequences of the gene families in RGPs...")
        genefams = set()
        for region in pangenome.regions:
            genefams |= region.families
    else:
        # unrecognized selector: nothing is selected
        genefams = set()

    with write_compressed_or_not(outname, compress) as fasta:
        progress = tqdm(genefams, unit="prot families", disable=not show_bar)
        for family in progress:
            header_line = '>' + family.name + "\n"
            fasta.write(header_line)
            sequence_line = family.sequence + "\n"
            fasta.write(sequence_line)
        progress.close()
    log.info(f"Done writing the representative amino acid sequences of the gene families : '{outname}'")
Example #27
0
def writeStats(output, soft_core, dup_margin, compress=False):
    """Write duplication statistics and per-organism statistics for ``pan``.

    Two files are produced:

    * ``<output>/mean_persistent_duplication.tsv``: one line per persistent
      family with its duplication ratio, its mean presence and whether it is
      a single copy marker (duplication ratio strictly below ``dup_margin``).
    * ``<output>/organisms_statistics.tsv``: one line per organism with
      family and gene counts per partition, exact core and soft core counts,
      and a completeness percentage based on the single copy markers
      (``"NA"`` when there is no marker).

    :param output: output directory path
    :param soft_core: a family is soft core when it is present in at least
        ``soft_core`` times the number of organisms
    :param dup_margin: threshold on the duplication ratio for single copy markers
    :param compress: forwarded to ``write_compressed_or_not``
    """

    def bucket_of(named_partition):
        """Return the counter key for a partition; anything not persistent or shell is cloud."""
        return named_partition if named_partition in ("persistent", "shell") else "cloud"

    log = logging.getLogger()
    log.info("Writing pangenome statistics...")
    log.info("Writing statistics on persistent duplication...")
    single_copy_markers = set()  # could use bitarrays if speed is needed
    with write_compressed_or_not(output + "/mean_persistent_duplication.tsv", compress) as outfile:
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        duplication_columns = ["persistent_family", "duplication_ratio", "mean_presence", "is_single_copy_marker"]
        outfile.write("\t".join(duplication_columns) + "\n")
        for fam in pan.geneFamilies:
            if fam.namedPartition != "persistent":
                continue
            mean_pres = len(fam.genes) / len(fam.organisms)
            # number of organisms holding more than one gene of the family
            nb_multi = sum(1 for gene_list in fam.getOrgDict().values() if len(gene_list) > 1)
            dup_ratio = nb_multi / len(fam.organisms)
            is_SCM = False
            if dup_ratio < dup_margin:
                is_SCM = True
                single_copy_markers.add(fam)
            fields = [fam.name, str(round(dup_ratio, 3)), str(round(mean_pres, 3)), str(is_SCM)]
            outfile.write("\t".join(fields) + "\n")
    log.info("Done writing stats on persistent duplication")
    log.info("Writing genome per genome statistics (completeness and counts)...")

    soft = set()  # could use bitarrays if speed is needed
    core = set()
    for fam in pan.geneFamilies:
        nb_fam_orgs = len(fam.organisms)
        if nb_fam_orgs >= pan.number_of_organisms() * soft_core:
            soft.add(fam)
        if nb_fam_orgs == pan.number_of_organisms():
            core.add(fam)

    with write_compressed_or_not(output + "/organisms_statistics.tsv", compress) as outfile:
        outfile.write(f"#soft_core={round(soft_core,3)}\n")
        outfile.write(f"#duplication_margin={round(dup_margin,3)}\n")
        organism_columns = ["organism", "nb_families", "nb_persistent_families", "nb_shell_families",
                            "nb_cloud_families", "nb_exact_core", "nb_soft_core", "nb_genes",
                            "nb_persistent_genes", "nb_shell_genes", "nb_cloud_genes",
                            "nb_exact_core_genes", "nb_soft_core_genes", "completeness",
                            "nb_single_copy_markers"]
        outfile.write("\t".join(organism_columns) + "\n")

        for org in pan.organisms:
            fams = org.families
            family_counts = {"persistent": 0, "shell": 0, "cloud": 0}
            for fam in fams:
                family_counts[bucket_of(fam.namedPartition)] += 1

            gene_counts = {"persistent": 0, "shell": 0, "cloud": 0}
            nb_gene_soft = 0
            nb_gene_core = 0
            for gene in org.genes:
                gene_counts[bucket_of(gene.family.namedPartition)] += 1
                if gene.family in soft:
                    nb_gene_soft += 1
                    # exact core genes are only counted among soft core genes
                    if gene.family in core:
                        nb_gene_core += 1

            if len(single_copy_markers) > 0:
                completeness = round((len(fams & single_copy_markers) / len(single_copy_markers))*100, 2)
            else:
                completeness = "NA"

            row = [org.name,
                   len(fams),
                   family_counts["persistent"],
                   family_counts["shell"],
                   family_counts["cloud"],
                   len(core & fams),
                   len(soft & fams),
                   org.number_of_genes(),
                   gene_counts["persistent"],
                   gene_counts["shell"],
                   gene_counts["cloud"],
                   nb_gene_core,
                   nb_gene_soft,
                   completeness,
                   len(fams & single_copy_markers)]
            outfile.write("\t".join(str(value) for value in row) + "\n")

    log.info("Done writing genome per genome statistics")
Example #28
0
def writeMatrix(sep, ext, output, compress=False, geneNames = False):
    """Write a gene-family by organism matrix to ``<output>/matrix.<ext>``.

    The first 14 columns describe the family; most are fixed placeholders
    (``"NA"`` or empty). They are followed by one column per organism of
    ``pan``. The second column holds the family's named partition when its
    ``partition`` is not empty, and its most common gene product otherwise.

    :param sep: field separator
    :param ext: file extension, also used in the log message
    :param output: output directory path
    :param compress: forwarded to ``write_compressed_or_not``
    :param geneNames: when true, organism cells list the quoted ``str()`` of each
        gene (empty cell: ``""``); otherwise they hold the gene count (empty cell: ``0``)
    """
    logging.getLogger().info(f"Writing the .{ext} file ...")
    outname = output + "/matrix." + ext
    with write_compressed_or_not(outname, compress) as matrix:
        header = ['"Gene"',  # 1
                  '"Non-unique Gene name"',  # 2
                  '"Annotation"',  # 3
                  '"No. isolates"',  # 4
                  '"No. sequences"',  # 5
                  '"Avg sequences per isolate"',  # 6
                  '"Accessory Fragment"',  # 7
                  '"Genome Fragment"',  # 8
                  '"Order within Fragment"',  # 9
                  '"Accessory Order with Fragment"',  # 10
                  '"QC"',  # 11
                  '"Min group size nuc"',  # 12
                  '"Max group size nuc"',  # 13
                  '"Avg group size nuc"']  # 14
        organism_columns = ['"' + str(org) + '"' for org in pan.organisms]  # 15 and following
        matrix.write(sep.join(header + organism_columns) + "\n")

        default_genes = ['""'] * len(pan.organisms) if geneNames else ["0"] * len(pan.organisms)
        # presumably maps each organism to its column, in the order of pan.organisms - verify
        org_index = pan.getIndex()
        for fam in pan.geneFamilies:
            genes = default_genes.copy()
            product = Counter()
            for org, gene_list in fam.getOrgDict().items():
                if geneNames:
                    genes[org_index[org]] = " ".join(['"' + str(gene) + '"' for gene in gene_list])
                else:
                    genes[org_index[org]] = str(len(gene_list))
                product.update(gene.product for gene in gene_list)

            top_product = str(product.most_common(1)[0][0])
            alt = fam.namedPartition if fam.partition != "" else top_product

            # stop - start of every gene of the family, used for the min/max/avg size columns
            lengths = [gene.stop - gene.start for gene in fam.genes]
            row = ['"' + fam.name + '"',  # 1
                   '"' + alt + '"',  # 2
                   '"' + top_product + '"',  # 3
                   '"' + str(len(fam.organisms)) + '"',  # 4
                   '"' + str(len(fam.genes)) + '"',  # 5
                   '"' + str(round(len(fam.genes) / len(fam.organisms), 2)) + '"',  # 6
                   '"NA"',  # 7
                   '"NA"',  # 8
                   '""',  # 9
                   '""',  # 10
                   '""',  # 11
                   '"' + str(min(lengths)) + '"',  # 12
                   '"' + str(max(lengths)) + '"',  # 13
                   '"' + str(round(sum(lengths) / len(lengths), 2)) + '"']  # 14
            matrix.write(sep.join(row + genes) + "\n")
    logging.getLogger().info(f"Done writing the matrix : '{outname}'")