def predictHotspots(pangenome, output, force=False, cpu=1, spot_graph=False, overlapping_match=2, set_size=3,
                    exact_match=1, disable_bar=False):
    """
    Predict the spots of the pangenome from its RGPs and store them in the pangenome.

    :param pangenome: pangenome whose regions are grouped into spots
    :param output: output location handed over to makeSpotGraph
    :param force: forwarded to checkPangenomeFormerSpots to allow erasing formerly computed spots
    :param cpu: accepted for interface compatibility, not used by this function
    :param spot_graph: forwarded to makeSpotGraph
    :param overlapping_match: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param set_size: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param exact_match: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param disable_bar: forwarded to checkPangenomeInfo
    """
    log = logging.getLogger()

    # the three spot parameters must be consistent with one another
    checkParameterLogic(overlapping_match, set_size, exact_match)
    # formerly computed spots are handled according to 'force'
    checkPangenomeFormerSpots(pangenome, force)
    # check statuses and load what the prediction needs
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True,
                       needRGP=True, disable_bar=disable_bar)

    log.info("Detecting multigenic families...")
    # the duplication margin is the one saved when the RGPs were predicted
    rgp_dup_margin = pangenome.parameters["RGP"]["dup_margin"]
    multigenics = pangenome.get_multigenics(rgp_dup_margin)

    log.info("Detecting hotspots in the pangenome...")
    spots = makeSpotGraph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size,
                          exact_match)

    nb_spots = len(spots)
    if nb_spots != 0:
        log.info(f"{nb_spots} spots were detected")
    else:
        log.warning("No spots were detected.")

    # the spots are registered and the status updated even when none were found
    pangenome.addSpots(spots)
    pangenome.status["spots"] = "Computed"
    pangenome.parameters["spots"] = {
        "set_size": set_size,
        "overlapping_match": overlapping_match,
        "exact_match": exact_match,
    }
def predictRGP(pangenome, force=False, persistent_penalty=3, variable_gain=1, min_length=3000, min_score=4,
               dup_margin=0.05, cpu=1):
    """
    Compute the Regions of Genomic Plasticity of every organism and add them to the pangenome.

    :param pangenome: pangenome in which the regions are predicted and stored
    :param force: forwarded to checkPangenomeFormerRGP
    :param persistent_penalty: forwarded to compute_org_rgp and saved as parameter
    :param variable_gain: forwarded to compute_org_rgp and saved as parameter
    :param min_length: forwarded to compute_org_rgp and saved as parameter
    :param min_score: forwarded to compute_org_rgp and saved as parameter
    :param dup_margin: margin given to pangenome.get_multigenics and saved as parameter
    :param cpu: accepted for interface compatibility, not used by this function
    """
    log = logging.getLogger()

    # check statuses and load info
    checkPangenomeFormerRGP(pangenome, force)
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True)

    log.info("Detecting multigenic families...")
    multigenics = pangenome.get_multigenics(dup_margin)

    log.info("Compute Regions of Genomic Plasticity ...")
    for organism in tqdm(pangenome.organisms, unit="genomes"):
        organism_regions = compute_org_rgp(organism, persistent_penalty, variable_gain, min_length, min_score,
                                           multigenics)
        pangenome.addRegions(organism_regions)
    log.info(f"Predicted {len(pangenome.regions)} RGP")

    # save the parameters, then the status
    pangenome.parameters["RGP"] = {
        "persistent_penalty": persistent_penalty,
        "variable_gain": variable_gain,
        "min_length": min_length,
        "min_score": min_score,
        "dup_margin": dup_margin,
    }
    pangenome.status['predictedRGP'] = "Computed"
def writeSequenceFiles(pangenome, output, fasta=None, anno=None, soft_core=0.95, regions=None, genes=None,
                       gene_families=None, prot_families=None, compress=False, disable_bar=False):
    """
    Write the requested sequence files (protein families, gene families, genes, regions).

    :param pangenome: pangenome to read the sequences from
    :param output: output location forwarded to the writers
    :param fasta: forwarded to writeRegionsSequences
    :param anno: forwarded to writeRegionsSequences
    :param soft_core: soft core threshold forwarded to the family and gene writers
    :param regions: filter for the regions to write, None to skip
    :param genes: filter for the genes to write, None to skip
    :param gene_families: filter for the gene families to write, None to skip
    :param prot_families: filter for the protein families to write, None to skip
    :param compress: forwarded to the writers
    :param disable_bar: forwarded to checkPangenomeInfo and to the writers
    :raises Exception: if nothing was requested, or if the pangenome lacks the required sequences
    """
    if not (regions or genes or prot_families or gene_families):
        raise Exception("You did not indicate what file you wanted to write.")

    family_level_filters = (genes, gene_families, prot_families)

    # work out what has to be loaded from the filters that were given
    any_filter_given = any(flt is not None for flt in (regions,) + family_level_filters)
    needAnnotations = any_filter_given
    needFamilies = any_filter_given
    needGraph = False
    needSpots = False
    needRegions = regions is not None or any(flt == "rgp" for flt in family_level_filters)
    needPartitions = any(flt in ["persistent", "shell", "cloud"] for flt in family_level_filters)
    needModules = any(flt is not None and 'module_' in flt for flt in family_level_filters)

    nothing_needed = not (needAnnotations or needFamilies or needGraph or needPartitions or needSpots
                          or needRegions or needModules)
    if nothing_needed:
        # nothing is needed, so something is wrong: report the filter that was provided.
        # (cannot trigger after the first guard, since a truthy filter always sets needAnnotations)
        provided_filter = ''
        for candidate in (genes, gene_families, prot_families, regions):
            if candidate is not None:
                provided_filter = candidate
        raise Exception(
            f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. {poss_values_log}")

    if pangenome.status["geneSequences"] not in ["inFile"] and (genes or gene_families):
        raise Exception("The provided pangenome has no gene sequences. "
                        "This is not compatible with any of the following options : --genes, --gene_families")
    if pangenome.status["geneFamilySequences"] not in ["Loaded", "Computed", "inFile"] and prot_families:
        raise Exception("The provided pangenome has no gene families. "
                        "This is not compatible with any of the following options : "
                        "--prot_families, --gene_families")

    checkPangenomeInfo(pangenome, needAnnotations=needAnnotations, needFamilies=needFamilies, needGraph=needGraph,
                       needPartitions=needPartitions, needRGP=needRegions, needSpots=needSpots,
                       needModules=needModules, disable_bar=disable_bar)

    # one writer per requested kind of sequence
    if prot_families is not None:
        writeFastaProtFam(pangenome, output, compress, prot_families, soft_core=soft_core, disable_bar=disable_bar)
    if gene_families is not None:
        writeFastaGeneFam(pangenome, output, compress, gene_families, soft_core=soft_core, disable_bar=disable_bar)
    if genes is not None:
        writeGeneSequences(pangenome, output, compress, genes, soft_core=soft_core, disable_bar=disable_bar)
    if regions is not None:
        writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, disable_bar=disable_bar)
def readClustering(pangenome, families_tsv_file, infer_singletons=False, force=False):
    """
    Read a gene families file and associate the genes of the pangenome with their gene family.

    Each line is split on whitespace into 2 or 3 fields: family ID, gene ID and an optional fragment flag
    ("F" marks a fragmented gene).

    :param pangenome: annotated pangenome receiving the gene families
    :param families_tsv_file: path of the gene families file (opened with read_compressed_or_not)
    :param infer_singletons: if True, call inferSingletons when some genes have no family in the file
    :param force: forwarded to checkPangenomeFormerClustering
    :raises ValueError: if a line has more than 3 fields
    :raises Exception: if no gene ID matches the annotations, or if some genes have no family and
        infer_singletons is False
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome, needAnnotations=True)

    logging.getLogger().info("Reading " + families_tsv_file + " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_file = read_compressed_or_not(families_tsv_file)
    frag = False  # becomes True as soon as one line carries fragment information
    nbGeneWtFam = 0  # number of genes of the file that were found in the annotations
    bar = tqdm(total=filesize, unit="bytes")
    try:
        for line in families_file:
            bar.update(len(line))
            elements = line.split()
            # 2 or 3 fields expected
            if len(elements) <= 1:
                logging.getLogger().error("No tabulation separator found in gene families file")
                exit(1)
            if len(elements) > 3:
                raise ValueError(f"Expected 2 or 3 fields per line in the gene families file, "
                                 f"found {len(elements)}: '{line.strip()}'")
            (fam_id, gene_id, is_frag) = elements if len(elements) == 3 else elements + [None]

            geneObj = pangenome.getGene(gene_id)
            if geneObj is not None:
                nbGeneWtFam += 1
                fam = pangenome.addGeneFamily(fam_id)
                geneObj.is_fragment = is_frag == "F"
                fam.addGene(geneObj)
            if is_frag == "F":
                frag = True
    finally:
        # release the progress bar and the file handle even if a line could not be processed
        bar.close()
        families_file.close()

    if nbGeneWtFam < len(pangenome.genes):  # not all genes have an associated cluster
        if nbGeneWtFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step. Please ensure that the annotations that you loaded previously and the clustering results that you have use the same gene IDs."
            )
        if not infer_singletons:
            raise Exception(
                "Some genes did not have an associated cluster. Either change your cluster file so that each gene has a cluster, or use the --infer_singletons option to infer a cluster for each non-clustered gene."
            )
        inferSingletons(pangenome)

    pangenome.status["genesClustered"] = "Computed"
    if frag:  # there was fragment information in the file
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
def writeFlatFiles(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, csv=False, genePA=False, gexf=False,
                   light_gexf=False, projection=False, stats=False, json=False, partitions=False, regions=False,
                   families_tsv=False, spots=False, borders=False, compress=False):
    """
    Write the requested flat files of the pangenome, each one as a task of a process pool.

    :param pangenome: pangenome to write; it is stored in the module-level 'pan' variable
    :param output: output location forwarded to every writer
    :param cpu: number of processes of the pool
    :param soft_core: forwarded to writeGEXF, writeStats and writeParts
    :param dup_margin: forwarded to writeStats and writeBorders
    :param csv: ask for writeMatrix with ',' as separator and "csv" as extension
    :param genePA: ask for writeGenePresenceAbsence
    :param gexf: ask for writeGEXF with its second argument set to False
    :param light_gexf: ask for writeGEXF with its second argument set to True
    :param projection: ask for writeProjections
    :param stats: ask for writeStats
    :param json: ask for writeJSON
    :param partitions: ask for writeParts
    :param regions: ask for writeRegions
    :param families_tsv: ask for writeGeneFamiliesTSV
    :param spots: ask for writeSpots
    :param borders: ask for writeBorders
    :param compress: forwarded to every writer
    :raises Exception: if no file was requested
    """
    global pan

    if not any([csv, genePA, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders,
                families_tsv]):
        raise Exception("You did not indicate what file you wanted to write.")

    pan = pangenome

    # past the guard at least one file is requested, and every file needs annotations and families
    needAnnotations = True
    needFamilies = True
    needPartitions = any((projection, stats, partitions, regions, spots, borders))
    needGraph = any((gexf, light_gexf, json))
    needRegions = any((regions, spots, borders))
    needSpots = any((spots, borders))

    checkPangenomeInfo(pan, needAnnotations=needAnnotations, needFamilies=needFamilies, needGraph=needGraph,
                       needPartitions=needPartitions, needRGP=needRegions, needSpots=needSpots)
    pan.getIndex()  # make the index because it will most likely be used

    # (requested?, writer, positional arguments), listed in submission order
    jobs = [
        (csv, writeMatrix, (',', "csv", output, compress, True)),
        (genePA, writeGenePresenceAbsence, (output, compress)),
        (gexf, writeGEXF, (output, False, soft_core, compress)),
        (light_gexf, writeGEXF, (output, True, soft_core, compress)),
        (projection, writeProjections, (output, compress)),
        (stats, writeStats, (output, soft_core, dup_margin, compress)),
        (json, writeJSON, (output, compress)),
        (partitions, writeParts, (output, soft_core, compress)),
        (families_tsv, writeGeneFamiliesTSV, (output, compress)),
        (regions, writeRegions, (output, compress)),
        (spots, writeSpots, (output, compress)),
        (borders, writeBorders, (output, dup_margin, compress)),
    ]

    with Pool(processes=cpu) as p:
        processes = [p.apply_async(func=writer, args=arguments) for wanted, writer, arguments in jobs if wanted]
        # get all the results while the pool is still alive
        for process in processes:
            process.get()
def align(pangenome, proteinFile, output, tmpdir, identity=0.8, coverage=0.8, defrag=False, cpu=1, getinfo=False,
          draw_related=False):
    """
    Align the proteins of a file on the gene family representatives of the pangenome.

    The family sequences are written in a temporary file inside a temporary directory created in 'tmpdir';
    both are removed when the function ends, including when a step fails.

    :param pangenome: pangenome holding the gene family sequences
    :param proteinFile: path of the protein file (opened with read_compressed_or_not)
    :param output: output location forwarded to the alignment and reporting functions
    :param tmpdir: directory in which the temporary directory is created
    :param identity: forwarded to alignSeqToPang
    :param coverage: forwarded to alignSeqToPang
    :param defrag: forwarded to alignSeqToPang
    :param cpu: forwarded to alignSeqToPang and getProtInfo
    :param getinfo: if True, load RGP, partitions and spots and call getProtInfo instead of only
        projecting the partitions
    :param draw_related: forwarded to getProtInfo
    :raises Exception: if the pangenome has no gene family sequences
    """
    if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )

    if getinfo:
        checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needRGP=True, needPartitions=True,
                           needSpots=True)
    else:
        checkPangenomeInfo(pangenome, needFamilies=True)

    log = logging.getLogger()
    # the TemporaryDirectory object itself is kept because alignSeqToPang receives it
    newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    with newtmpdir, tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name) as tmpPangFile:
        writeGeneFamSequences(pangenome, tmpPangFile)

        with read_compressed_or_not(proteinFile) as protFileObj:
            protSet = getProt(protFileObj)
            alignFile = alignSeqToPang(tmpPangFile, protFileObj, output, newtmpdir, cpu, defrag, identity,
                                       coverage)

        prot2pang = readAlignments(alignFile, pangenome)

        if getinfo:
            getProtInfo(prot2pang, pangenome, output, cpu, draw_related)
        else:
            partProj = projectPartition(prot2pang, protSet, output)  # write the partition assignation only
            log.info(f"proteins partition projection : '{partProj}'")

        log.info(f"{len(prot2pang)} proteins over {len(protSet)} have at least one hit in the pangenome.")
        log.info(f"Blast-tab file of the alignment : '{alignFile}'")
def drawSpots(pangenome, output, spot_list, disable_bar):
    """
    Draw the spots named in 'spot_list', or every spot with more than one unique ordered set for 'all'.

    :param pangenome: pangenome with predicted spots
    :param output: output location forwarded to drawSelectedSpots
    :param spot_list: comma-separated spot names; each name is either 'spot_<ID>' or '<ID>'.
        If any of the names is 'all', all the spots are considered.
    :param disable_bar: forwarded to checkPangenomeInfo and drawSelectedSpots
    """
    # check that the pangenome has spots
    checkPredictedSpots(pangenome)

    # modules are not required to be loaded, but if they have been computed we load them.
    needMod = pangenome.status["modules"] != "No"

    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True,
                       needRGP=True, needSpots=True, needModules=needMod, disable_bar=disable_bar)

    requested_names = [name.strip() for name in spot_list.split(',')]

    # 'all' must be looked for in the raw names: once prefixed it reads 'spot_all' and cannot be recognised
    if 'all' in requested_names:
        selected_spots = [s for s in pangenome.spots if len(s.getUniqOrderedSet()) > 1]
    else:
        curated_spot_names = {name if 'spot' in name else 'spot_' + name for name in requested_names}
        selected_spots = [s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_names]

    if len(selected_spots) < 10:
        logging.getLogger().info(
            f"Drawing the following spots: {','.join(['spot_' + str(s.ID) for s in selected_spots])}")
    else:
        logging.getLogger().info(f"Drawing {len(selected_spots)} spots")

    # the spots are drawn with the parameters that were used to predict them
    spot_parameters = pangenome.parameters["spots"]
    drawSelectedSpots(selected_spots, pangenome, output,
                      overlapping_match=spot_parameters["overlapping_match"],
                      exact_match=spot_parameters["exact_match"],
                      set_size=spot_parameters["set_size"],
                      disable_bar=disable_bar)
def genomes_fluidity(pangenome, disable_bar=False):
    """
    Compute the genomes fluidity from the pangenome

    :param pangenome: pangenome which will be used to compute the genomes fluidity
    :type pangenome: Pangenome
    :param disable_bar: Disable the progress bar
    :type disable_bar: bool

    :return: Genomes fluidity value for each of the 'all', 'shell', 'cloud' and 'accessory' subsets
    :rtype: dict

    :raises Exception: if the pangenome has fewer than two organisms (no pair of genomes to compare)
    """
    log = logging.getLogger()

    # check statuses and load info
    log.info("Check information in pangenome")
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, disable_bar=disable_bar)

    nb_organisms = pangenome.number_of_organisms()
    if nb_organisms < 2:
        # the normalisation below divides by N * (N - 1)
        raise Exception(f"The genomes fluidity needs at least 2 genomes, the pangenome has {nb_organisms}.")
    nb_pairs = nb_organisms * (nb_organisms - 1) // 2

    fluidity_dict = {'all': None, 'shell': None, 'cloud': None, 'accessory': None}
    for subset in fluidity_dict:
        log.debug(f"Compute binaries for {subset} partition")
        # Compute binaries corresponding to presence / absence of families in organisms
        pangenome.compute_org_bitarrays(part=subset)

        log.debug("Get number of families in each organisms")
        org2_nb_fam = nb_fam_per_org(pangenome, disable_bar)

        log.info(f"Compute rate of unique family for each genome combination in {subset}")
        g_sum = 0
        # the pairs are streamed; 'total' keeps the progress bar informed without building the whole list
        for org_a, org_b in tqdm(combinations(pangenome.organisms, 2), total=nb_pairs, unit="combination",
                                 disable=disable_bar):
            tot_fam = org2_nb_fam.get(org_a.name) + org2_nb_fam.get(org_b.name)
            # NOTE(review): the '- 1' presumably discounts an extra bit set in the bitarrays - confirm
            common_fam = popcount(org_a.bitarray & org_b.bitarray) - 1
            if tot_fam > 0 and common_fam > 0:
                g_sum += (tot_fam - 2 * common_fam) / tot_fam

        fluidity_dict[subset] = (2 / (nb_organisms * (nb_organisms - 1))) * g_sum
    return fluidity_dict
def predictHotspots(pangenome, output, force=False, cpu=1, spot_graph=False, overlapping_match=2, set_size=3,
                    exact_match=1, draw_hotspot=False, interest=""):
    """
    Predict the spots of the pangenome from its RGPs, optionally draw some of them, and store them.

    :param pangenome: pangenome whose regions are grouped into spots
    :param output: output location handed over to makeSpotGraph and draw_spots
    :param force: forwarded to checkPangenomeFormerSpots to allow erasing formerly computed spots
    :param cpu: forwarded to draw_spots
    :param spot_graph: forwarded to makeSpotGraph
    :param overlapping_match: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param set_size: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param exact_match: spot parameter, checked by checkParameterLogic and saved in the pangenome
    :param draw_hotspot: if True, the spots returned by select_spots are drawn
    :param interest: comma-separated elements of interest (e.g. gene name, product substring)
    """
    log = logging.getLogger()

    # the three spot parameters must be consistent with one another
    checkParameterLogic(overlapping_match, set_size, exact_match)
    # formerly computed spots are handled according to 'force'
    checkPangenomeFormerSpots(pangenome, force)
    # check statuses and load what the prediction needs
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=False, needPartitions=True,
                       needRGP=True)

    log.info("Detecting multigenic families...")
    # the duplication margin is the one saved when the RGPs were predicted
    rgp_dup_margin = pangenome.parameters["RGP"]["dup_margin"]
    multigenics = pangenome.get_multigenics(rgp_dup_margin)

    log.info("Detecting hotspots in the pangenome...")
    spots = makeSpotGraph(pangenome.regions, multigenics, output, spot_graph, overlapping_match, set_size,
                          exact_match)

    # elements of interest to search in the gene annotations; none when 'interest' is empty
    elements = [el.strip() for el in interest.split(',')] if interest != "" else []

    if draw_hotspot:
        drawn_spots = select_spots(pangenome, spots, elements)
        if len(drawn_spots) > 0:
            draw_spots(drawn_spots, output, cpu, overlapping_match, exact_match, set_size, multigenics, elements)

    pangenome.addSpots(spots)
    pangenome.status["spots"] = "Computed"
    pangenome.parameters["spots"] = {
        "set_size": set_size,
        "overlapping_match": overlapping_match,
        "exact_match": exact_match,
    }
def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", source="protein", force=False,
                  show_bar=True):
    """
    Compute and write the multiple sequence alignments of the families of a partition.

    :param pangenome: pangenome holding the gene families and gene sequences
    :param output: directory in which the 'msa_<partition>_<source>/' directory is created
    :param cpu: forwarded to computeMSA
    :param partition: filter given to getFamiliesToWrite; 'persistent', 'shell' and 'cloud' require partitions
    :param tmpdir: forwarded to computeMSA
    :param source: forwarded to computeMSA, also part of the output directory name
    :param force: forwarded to mkOutdir
    :param show_bar: forwarded to checkPangenomeInfo and computeMSA
    """
    log = logging.getLogger()

    # partitions only have to be loaded when the filter is one of them
    needPartitions = partition in ["persistent", "shell", "cloud"]

    outname = output + f"/msa_{partition}_{source}/"
    mkOutdir(outname, force=force)

    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needPartitions=needPartitions,
                       needGeneSequences=True, show_bar=show_bar)

    log.info(f"computing msa for {partition} families...")
    families = getFamiliesToWrite(pangenome, partitionFilter=partition)

    # this must exist since we loaded the pangenome and families are required
    translation_code = pangenome.parameters["cluster"]["translation_table"]

    computeMSA(families, outname, cpu=cpu, tmpdir=tmpdir, source=source, code=translation_code, show_bar=show_bar)
    log.info(f"Done writing all {partition} MSA in: {outname}")
def projectRGP(pangenome, annotation, output, tmpdir, identity=0.8, coverage=0.8, defrag=False, cpu=1,
               translation_table=11):
    """
    Predict the RGP of an annotated genome using the gene families of an existing pangenome.

    The genes of the annotation file are aligned on the gene family representatives, the families and
    multigenic families are linked to the new genome, and the RGP are predicted with the parameters saved
    in the pangenome. The regions are then written with writeGffRegions or writeGbffRegions.

    The temporary directory and files are removed when the alignment step ends, including when it fails.

    :param pangenome: reference pangenome, with gene family sequences and saved "RGP" parameters
    :param annotation: path of the annotation file; handled when detect_filetype returns "gff" or "gbff"
    :param output: output location forwarded to the alignment and writing functions
    :param tmpdir: directory in which the temporary directory is created
    :param identity: forwarded to alignSeqToPang
    :param coverage: forwarded to alignSeqToPang
    :param defrag: forwarded to alignSeqToPang
    :param cpu: forwarded to alignSeqToPang
    :param translation_table: forwarded to alignSeqToPang
    :raises Exception: if the pangenome has no gene family sequences, or if the gff file has no sequence
    """
    if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )

    log = logging.getLogger()

    # read given file
    log.info("Retrieving the annotations from the given file")
    # a new 'pangenome' is needed as the annotation reading functions take a pangenome as input.
    singleOrgPang = Pangenome()
    filetype = detect_filetype(annotation)
    if filetype == "gff":
        # if there are no sequences in the gff, this value will change to 'No'
        singleOrgPang.status["geneSequences"] = "Computed"
        read_org_gff(singleOrgPang, 'myGenome', annotation, [], True)
        if singleOrgPang.status["geneSequences"] == "No":
            raise Exception(
                f"The given annotation file did not have a FASTA sequence included (expected '##FASTA' pragma followed by a fasta-like file format). This is required for computing the Regions of Genomic Plasticity of your organism"
            )
    elif filetype == "gbff":
        read_org_gbff(singleOrgPang, 'myGenome', annotation, [], True)

    # check and read given pangenome
    checkPangenomeInfo(pangenome, needFamilies=True, needPartitions=True, needAnnotations=True)

    # the TemporaryDirectory object itself is kept because alignSeqToPang receives it
    newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    with newtmpdir, \
            tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name) as tmpPangFile, \
            tempfile.NamedTemporaryFile(mode="w", dir=newtmpdir.name) as tmpGeneFile:
        writeGeneSequencesFromAnnotations(singleOrgPang, tmpGeneFile)
        writeGeneFamSequences(pangenome, tmpPangFile)
        blastout = alignSeqToPang(tmpPangFile, tmpGeneFile, output, newtmpdir, cpu, defrag, identity, coverage,
                                  True, translation_table)

    # artificially reconstruct the gene families and their partitions
    linkNewGenomeFamilies(singleOrgPang, pangenome, blastout)

    rgp_parameters = pangenome.parameters["RGP"]
    multigenics = pangenome.get_multigenics(rgp_parameters["dup_margin"])
    genomeMultigenics = linkMultigenicFamilies(singleOrgPang, multigenics)

    log.info("Predicting RGP in your genome")
    for org in singleOrgPang.organisms:
        genomeRGP = compute_org_rgp(org, rgp_parameters["persistent_penalty"], rgp_parameters["variable_gain"],
                                    rgp_parameters["min_length"], rgp_parameters["min_score"], genomeMultigenics)

    if filetype == "gff":
        # reread the file and insert sequence_feature objects corresponding to the predicted regions
        log.info("Writing the RGP in a gff file...")
        writeGffRegions(annotation, genomeRGP, output)
    elif filetype == "gbff":
        log.info("Writing the RGP in a gbff file...")
        writeGbffRegions(annotation, genomeRGP, output)
def drawTilePlot(pangenome, output, nocloud=False):
    """
    Draw the presence/absence matrix of the gene families as a plotly heatmap in '<output>/tile_plot.html'.

    Organisms (columns) are ordered with a single-linkage clustering, families (rows) are grouped by
    partition and sorted by decreasing number of organisms inside each group.

    :param pangenome: partitioned pangenome to draw
    :param output: directory in which 'tile_plot.html' is written
    :param nocloud: if True, families whose partition starts with "C" are left out
    :raises Exception: if the pangenome has not been partitioned
    """
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=True)
    if pangenome.status["partitionned"] == "No":
        raise Exception(
            "Cannot draw the tile plot as your pangenome has not been partitionned"
        )
    if len(pangenome.organisms) > 500 and nocloud is False:
        logging.getLogger().warning(
            "You asked to draw a tile plot for a lot of organisms (>500). Your browser will probably not be able to open it."
        )
    logging.getLogger().info("Drawing the tile plot...")
    # coordinates of the sparse presence/absence matrix (families as rows, organisms as columns)
    data = []
    all_indexes = []
    all_columns = []
    fam2index = {}
    index2fam = {}
    if nocloud:
        families = {
            fam
            for fam in pangenome.geneFamilies
            if not fam.partition.startswith("C")
        }
    else:
        families = set(pangenome.geneFamilies)
    org_index = pangenome.getIndex()
    # reverse mapping, used to turn the dendrogram leaves back into organisms
    index2org = {}
    for org, index in org_index.items():
        index2org[index] = org
    COLORS = {
        "pangenome": "black",
        "exact_accessory": "#EB37ED",
        "exact_core": "#FF2828",
        "soft_core": "#c7c938",
        "soft_accessory": "#996633",
        "shell": "#00D860",
        "persistent": "#F7A507",
        "cloud": "#79DEFF",
        "undefined": "#828282"
    }
    logging.getLogger().info("start with matrice")
    for row, fam in enumerate(families):
        new_col = [org_index[org] for org in fam.organisms]
        all_indexes.extend([row] * len(new_col))
        all_columns.extend(new_col)
        data.extend([1.0] * len(new_col))
        index2fam[row] = fam.name
        fam2index[fam.name] = row
    mat_p_a = csc_matrix((data, (all_indexes, all_columns)),
                         shape=(len(families), len(pangenome.organisms)),
                         dtype='float')
    # cluster on the complement of the similarities returned by jaccard_similarities
    dist = pdist(1 - jaccard_similarities(mat_p_a, 0).todense())
    hc = linkage(dist, 'single')
    dendro = dendrogram(hc, no_plot=True)
    logging.getLogger().info(
        "done with making the dendrogram to order the organisms on the plot")
    order_organisms = [index2org[index] for index in dendro["leaves"]]
    binary_data = []
    text_data = []
    fam_order = []
    # families grouped by their partition label
    partitions_dict = defaultdict(list)
    shell_subs = set()
    for fam in families:
        partitions_dict[fam.partition].append(fam)
        if fam.partition.startswith("S"):
            shell_subs.add(
                fam.partition
            )  # number of elements will tell the number of subpartitions
    ordered_nodes = []
    ordored_nodes_p = sorted(partitions_dict["P"],
                             key=lambda n: len(n.organisms),
                             reverse=True)
    ordored_nodes_c = sorted(partitions_dict["C"],
                             key=lambda n: len(n.organisms),
                             reverse=True)
    # separators hold the y position of the line drawn after each group of families
    sep_p = len(ordored_nodes_p) - 0.5
    separators = [sep_p]
    shell_NA = None
    if len(shell_subs) == 1:
        # pop() empties shell_subs, so the later 'len(shell_subs) > 1' tests are False on this path
        ordored_nodes_s = sorted(partitions_dict[shell_subs.pop()],
                                 key=lambda n: len(n.organisms),
                                 reverse=True)
        ordered_nodes = ordored_nodes_p + ordored_nodes_s + ordored_nodes_c
        separators.append(separators[len(separators) - 1] +
                          len(ordored_nodes_s))
        separators.append(separators[len(separators) - 1] +
                          len(ordored_nodes_c))
    else:
        # ordered_nodes is the same list object as ordored_nodes_p; '+=' below extends it in place
        ordered_nodes = ordored_nodes_p
        for subpartition in sorted(shell_subs):
            if subpartition == "S_":
                # remember which separator precedes the "S_" subpartition
                shell_NA = len(separators) - 1
            ordored_nodes_s = sorted(partitions_dict[subpartition],
                                     key=lambda n: len(n.organisms),
                                     reverse=True)
            ordered_nodes += ordored_nodes_s
            separators.append(separators[len(separators) - 1] +
                              len(ordored_nodes_s))
        ordered_nodes += ordored_nodes_c
        separators.append(separators[len(separators) - 1] +
                          len(ordored_nodes_c))
    logging.getLogger().info(
        "Getting the gene name(s) and the number for each tile of the plot ..."
    )
    for node in ordered_nodes:
        # labels are prefixed with a zero-width non-joiner character
        fam_order.append('\u200c' + node.name)
        # 'data' is reused here for the organisms of the family
        data = node.organisms
        # number of genes of the family per organism, NaN where the family is absent
        binary_data.append([
            len(node.getGenesPerOrg(org)) if org in data else numpy.nan
            for org in order_organisms
        ])
        # hover text: the genes of the family in the organism, one per line
        text_data.append([("\n".join(map(str, node.getGenesPerOrg(org))))
                          if org in data else numpy.nan
                          for org in order_organisms])
    xaxis_values = ['\u200c' + org.name for org in order_organisms]
    logging.getLogger().info(
        "Done extracting names and numbers. Making the heatmap ...")
    heatmap = go.Heatmap(z=binary_data,
                         x=xaxis_values,
                         y=fam_order,
                         text=text_data,
                         zauto=False,
                         zmin=1,
                         zmax=2,
                         autocolorscale=False,
                         colorscale=[[0.50, 'rgb(100, 15, 78)'],
                                     [1, 'rgb(59, 157, 50)']],
                         colorbar=dict(title='Presence/Absence',
                                       titleside='top',
                                       tickmode='array',
                                       tickvals=[1, 2],
                                       ticktext=['Presence', 'Multicopy'],
                                       ticks='outside'))
    # one green per shell subpartition, "S_" excluded since it gets the plain shell color
    shell_color = None
    if len(shell_subs) > 1:
        if "S_" not in shell_subs:
            shell_color = cl.interp(cl.flipper()['seq']['9']['Greens'][1:7],
                                    len(shell_subs))
        else:
            shell_color = cl.interp(cl.flipper()['seq']['9']['Greens'][1:7],
                                    len(shell_subs) - 1)
    shapes = []
    sep_prec = 0
    for nb, sep in enumerate(separators):
        # first group is drawn as persistent, last as cloud, anything in between as shell
        color = None
        if nb == 0:
            color = COLORS["persistent"]
        elif nb == (len(separators) - 1):
            color = COLORS["cloud"]
        elif len(shell_subs) > 1:
            if shell_NA is not None and nb == shell_NA:
                color = COLORS["shell"]
            else:
                color = shell_color.pop()
        else:
            color = COLORS["shell"]
        # thick vertical line on the left of the group
        shapes.append(
            dict(type='line',
                 x0=-1,
                 x1=-1,
                 y0=sep_prec,
                 y1=sep,
                 line=dict(dict(width=10, color=color))))
        # thick vertical line on the right of the group
        shapes.append(
            dict(type='line',
                 x0=len(pangenome.organisms),
                 x1=len(pangenome.organisms),
                 y0=sep_prec,
                 y1=sep,
                 line=dict(dict(width=10, color=color))))
        # thin horizontal line closing the group
        shapes.append(
            dict(type='line',
                 x0=-1,
                 x1=len(pangenome.organisms),
                 y0=sep,
                 y1=sep,
                 line=dict(dict(width=1, color=color))))
        sep_prec = sep
    layout = go.Layout(title="presence/absence matrix",
                       xaxis=go.layout.XAxis(ticktext=xaxis_values,
                                             title='organisms',
                                             tickvals=xaxis_values,
                                             automargin=True,
                                             tickfont=dict(size=10)),
                       yaxis=go.layout.YAxis(ticktext=fam_order,
                                             tickvals=fam_order,
                                             title='gene families',
                                             automargin=True,
                                             tickfont=dict(size=10)),
                       shapes=shapes,
                       plot_bgcolor='#ffffff')
    logging.getLogger().info("Drawing the figure itself...")
    out_plotly.plot(go.Figure(data=[heatmap], layout=layout),
                    filename=output + "/tile_plot.html",
                    auto_open=False)
    logging.getLogger().info(
        f"Done with the tile plot : '{output+'/tile_plot.html'}' ")
def drawUCurve(pangenome, output, soft_core=0.95, disable_bar=False):
    """
    Draw the gene families frequency distribution (U-shaped curve) in '<output>/Ushaped_plot.html'.

    For each number of organisms x, the bar gives the number of families present in x organisms, stacked
    by partition when the pangenome is partitioned without undefined families. The title carries the Chao1
    estimate S_obs + F1^2 / (2 * F2), where F1 and F2 are the numbers of families found in exactly one and
    exactly two organisms; it is "NA" when there is no family found in exactly two organisms.

    :param pangenome: pangenome to draw
    :param output: directory in which 'Ushaped_plot.html' is written
    :param soft_core: fraction of the organisms at which the vertical dashed line is drawn
    :param disable_bar: forwarded to checkPangenomeInfo
    """
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=True, disable_bar=disable_bar)
    logging.getLogger().info("Drawing the U-shaped curve...")

    max_bar = 0  # height of the tallest bar, used for the soft core line
    count = defaultdict(lambda: defaultdict(int))  # count[number of organisms][partition name]
    is_partitioned = False
    has_undefined = False
    for fam in pangenome.geneFamilies:
        nb_org = len(fam.organisms)
        if fam.partition != "":
            is_partitioned = True
            if fam.partition == "U":
                has_undefined = True
            count[nb_org][fam.namedPartition] += 1
        count[nb_org]["pangenome"] += 1
        max_bar = max(max_bar, count[nb_org]["pangenome"])

    chao = "NA"
    singletons = count[1]["pangenome"]
    doubletons = count[2]["pangenome"]
    if doubletons > 0:
        # '**' is the power operator ('^' is a bitwise XOR in Python)
        chao = round(len(pangenome.geneFamilies) + (singletons ** 2) / (doubletons * 2), 2)

    COLORS = {
        "pangenome": "black",
        "exact_accessory": "#EB37ED",
        "exact_core": "#FF2828",
        "soft_core": "#c7c938",
        "soft_accessory": "#996633",
        "shell": "#00D860",
        "persistent": "#F7A507",
        "cloud": "#79DEFF",
        "undefined": "#828282"
    }

    nb_organisms = len(pangenome.organisms)
    x_values = list(range(1, nb_organisms + 1))

    if is_partitioned and not has_undefined:
        bar_names = ("persistent", "shell", "cloud")
    else:
        bar_names = ('undefined' if has_undefined else "pangenome",)

    data_plot = [
        go.Bar(x=x_values,
               y=[count[nb_org][name] for nb_org in x_values],
               name=name,
               marker=dict(color=COLORS[name]))
        for name in bar_names
    ]

    x = nb_organisms * soft_core
    layout = go.Layout(
        title="Gene families frequency distribution (U shape), chao=" + str(chao),
        xaxis=dict(title='Occurring in x genomes'),
        yaxis=dict(title='# of gene families (F)'),
        barmode='stack',
        shapes=[
            dict(type='line', x0=x, x1=x, y0=0, y1=max_bar,
                 line=dict(width=5, dash='dashdot', color="grey"))
        ],
        plot_bgcolor='#ffffff')

    fig = go.Figure(data=data_plot, layout=layout)
    out_plotly.plot(fig, filename=output + "/Ushaped_plot.html", auto_open=False)
    logging.getLogger().info(f"Done drawing the U-shaped curve : '{output + '/Ushaped_plot.html'}'")
def partition(pangenome, tmpdir, outputdir=None, beta=2.5, sm_degree=10, free_dispersion=False, chunk_size=500,
              K=-1, Krange=None, ICL_margin=0.05, draw_ICL=False, cpu=1, seed=42, keep_tmp_files=False):
    """
    Partition the gene families of the pangenome and store the result in each family.

    When there are more organisms than 'chunk_size', the organisms are repeatedly sampled in chunks,
    each chunk is partitioned with nemSamples in a process pool, and a family receives the partition
    it obtained most often. Otherwise all the organisms are partitioned in a single run.

    :param pangenome: pangenome to partition; it is stored in the module-level 'pan' variable
    :param tmpdir: directory in which a temporary working directory is created
    :param outputdir: output directory, required with draw_ICL; the NEM files are copied there when
        keep_tmp_files is True
    :param beta: saved as parameter; forwarded to nemSamples, or scaled by nb_fam / edges_weight for
        run_partitioning
    :param sm_degree: saved as "max_node_degree_for_smoothing" and forwarded to the NEM helpers
    :param free_dispersion: saved as parameter and forwarded to the NEM helpers
    :param chunk_size: number of organisms per sample when sampling is used
    :param K: number of partitions; it is estimated with evaluate_nb_partitions when lower than 3
    :param Krange: range given to evaluate_nb_partitions, [3, 20] when not provided
    :param ICL_margin: forwarded to evaluate_nb_partitions
    :param draw_ICL: forwarded to evaluate_nb_partitions
    :param cpu: number of processes of the pool, also forwarded to evaluate_nb_partitions
    :param seed: seed of the random module, also forwarded to the NEM helpers
    :param keep_tmp_files: if True the temporary directory is copied to outputdir instead of removed
    :raises Exception: if draw_ICL is set without outputdir, or if the single-run partitioning
        returns an empty result
    """
    Krange = Krange or [3, 20]
    # NOTE(review): 'pan' and 'samples' are module-level variables, presumably read by the worker
    # functions (nemSamples) - confirm. 'samples' must already exist: it is only appended to here.
    global pan
    global samples
    pan = pangenome
    if draw_ICL and outputdir is None:
        raise Exception(
            "Combination of option impossible: You asked to draw the ICL curves but did not provide an output directory!"
        )
    checkPangenomeInfo(pangenome,
                       needAnnotations=True,
                       needFamilies=True,
                       needGraph=True)
    organisms = set(pangenome.organisms)
    tmpdirObj = tempfile.TemporaryDirectory(dir=tmpdir)
    # from here on 'tmpdir' is the path of the newly created temporary directory
    tmpdir = tmpdirObj.name
    if len(organisms) <= 10:
        logging.getLogger().warning(
            f"The number of selected organisms is too low ({len(organisms)} organisms used) to robustly partition the graph"
        )
    # save the parameters used for this partitioning
    pangenome.parameters["partition"] = {}
    pangenome.parameters["partition"]["beta"] = beta
    pangenome.parameters["partition"]["free_dispersion"] = free_dispersion
    pangenome.parameters["partition"][
        "max_node_degree_for_smoothing"] = sm_degree
    if len(organisms) > chunk_size:
        pangenome.parameters["partition"]["chunk_size"] = chunk_size
    pangenome.parameters["partition"]["computed_K"] = False
    if K < 3:
        # K was not usable as given, so it is estimated
        pangenome.parameters["partition"]["computed_K"] = True
        logging.getLogger().info(
            "Estimating the optimal number of partitions...")
        K = evaluate_nb_partitions(organisms, sm_degree, free_dispersion,
                                   chunk_size, Krange, ICL_margin, draw_ICL,
                                   cpu, tmpdir, seed, outputdir)
        logging.getLogger().info(
            f"The number of partitions has been evaluated at {K}")
    pangenome.parameters["partition"]["K"] = K
    init = "param_file"
    partitionning_results = {}
    families = set()
    cpt = 0
    # per family, number of times each partition was assigned over the samples
    cpt_partition = {}
    random.seed(seed)
    for fam in pangenome.geneFamilies:
        families.add(fam)
        if chunk_size < len(organisms):
            cpt_partition[fam.name] = {"P": 0, "S": 0, "C": 0, "U": 0}
    start_partitionning = time.time()
    logging.getLogger().info("Partitioning...")
    pansize = len(families)
    if chunk_size < len(organisms):
        # families whose partition is considered settled
        validated = set()

        def validate_family(result):
            """Count the partitions of one sample result and validate the families that are settled."""
            for node, nem_class in result[0].items():
                # only the first character of the class is counted
                cpt_partition[node][nem_class[0]] += 1
                sum_partionning = sum(cpt_partition[node].values())
                if (sum_partionning > len(organisms) / chunk_size
                        and max(cpt_partition[node].values()) >=
                        sum_partionning * 0.5) or (sum_partionning >
                                                   len(organisms)):
                    if node not in validated:
                        if max(cpt_partition[node].values()
                               ) < sum_partionning * 0.5:
                            cpt_partition[node]["U"] = len(
                                organisms
                            )  # if, despite len(organisms) partitionings, an absolute majority is not found then the family is set to undefined
                        validated.add(node)

        # number of samples in which each organism has been drawn
        org_nb_sample = Counter()
        for org in organisms:
            org_nb_sample[org] = 0
        condition = len(organisms) / chunk_size
        while len(validated) < pansize:
            prev = len(
                samples
            )  # if we've been sampling already, samples is not empty.
            while not all(
                    val >= condition for val in org_nb_sample.values()
            ):  # each organism must be sampled at least 'condition' times.
                shuffled_orgs = list(organisms)  # copy the organisms
                random.shuffle(shuffled_orgs)  # shuffle the copied list
                while len(shuffled_orgs) > chunk_size:
                    samples.append(set(shuffled_orgs[:chunk_size]))
                    for org in samples[-1]:
                        org_nb_sample[org] += 1
                    shuffled_orgs = shuffled_orgs[chunk_size:]
            args = []
            # one task per sample added since the previous round
            for i, _ in enumerate(samples[prev:], start=prev):
                args.append((i, tmpdir, beta, sm_degree, free_dispersion, K,
                             seed, init, keep_tmp_files))
            logging.getLogger().info("Launching NEM")
            with Pool(processes=cpu) as p:
                # launch the partitionings
                bar = tqdm(range(len(args)), unit=" samples partitionned")
                for result in p.imap_unordered(nemSamples, args):
                    validate_family(result)
                    bar.update()
                bar.close()
                condition += 1  # if len(validated) < pansize, we will want to resample more.
                logging.getLogger().debug(
                    f"There are {len(validated)} validated families out of {pansize} families."
                )
                p.close()
                p.join()
        # each family gets the partition it was assigned most often
        for fam, data in cpt_partition.items():
            partitionning_results[fam] = max(data, key=data.get)
        # need to compute the median vectors of each partition ???
        partitionning_results = [partitionning_results,
                                 []]  # introduces a 'non feature'.
        logging.getLogger().info(
            f"Did {len(samples)} partitionning with chunks of size {chunk_size} among {len(organisms)} genomes in {round(time.time() - start_partitionning,2)} seconds."
        )
    else:
        # few enough organisms: a single partitioning on all of them
        edges_weight, nb_fam = write_nem_input_files(tmpdir + "/" + str(cpt) +
                                                     "/",
                                                     organisms,
                                                     sm_degree=sm_degree)
        partitionning_results = run_partitioning(tmpdir + "/" + str(cpt) +
                                                 "/",
                                                 len(organisms),
                                                 beta *
                                                 (nb_fam / edges_weight),
                                                 free_dispersion,
                                                 K=K,
                                                 seed=seed,
                                                 init=init,
                                                 keep_files=keep_tmp_files)
        if partitionning_results == [{}, None, None]:
            raise Exception(
                "Statistical partitionning does not work on your data. This usually happens because you used very few (<15) genomes."
            )
        cpt += 1
        logging.getLogger().info(
            f"Partitionned {len(organisms)} genomes in {round(time.time() - start_partitionning,2)} seconds."
        )
    # store the partition of each family in the pangenome
    for famName, partition in partitionning_results[0].items():
        pangenome.getGeneFamily(famName).partition = partition
    pangenome.status["partitionned"] = "Computed"
    if not keep_tmp_files:
        tmpdirObj.cleanup()
    else:
        copytree(tmpdir, outputdir + "/NEM_files/")
def makeRarefactionCurve(pangenome, output, tmpdir, beta=2.5, depth=30, minSampling=1, maxSampling=100,
                         sm_degree=10, free_dispersion=False, chunk_size=500, K=-1, cpu=1, seed=42,
                         kestimate=False, krange=[3, -1], soft_core=0.95, show_bar=True):
    """Draw rarefaction curves from random organism samples of the pangenome.

    For every sample size from ``minSampling + 1`` up to ``maxSampling`` (capped at the
    number of organisms in the pangenome), ``depth`` random subsets of organisms are
    drawn. For each subset the exact/soft core and accessory family counts are computed
    from the family bitarrays, then every subset is handed to ``launch_raref_nem``
    through a process pool and the merged per-sample counts are given to ``drawCurve``.

    :param pangenome: pangenome to sample; it is also stored in ``ppp.pan``.
    :param output: forwarded to ``drawCurve``.
    :param tmpdir: directory in which a private temporary directory is created.
    :param beta: forwarded unchanged to ``launch_raref_nem``.
    :param depth: number of samples drawn for each sample size.
    :param minSampling: samples start at ``minSampling + 1`` organisms.
    :param maxSampling: largest sample size; capped at the number of organisms.
    :param sm_degree: forwarded to the K estimation and to ``launch_raref_nem``.
    :param free_dispersion: forwarded to the K estimation and to ``launch_raref_nem``.
    :param chunk_size: forwarded to the K estimation and to ``launch_raref_nem``.
    :param K: number of partitions; when below 3 and ``kestimate`` is False, the value
        stored in the pangenome parameters is reused, or it is estimated once.
    :param cpu: number of worker processes.
    :param seed: forwarded to the K estimation and to ``launch_raref_nem``.
    :param kestimate: when True, K is not fixed up front here.
    :param krange: two-item range for K; negative bounds are replaced by the K stored in
        the pangenome parameters. The argument itself is never modified.
    :param soft_core: fraction of a sample's organisms a family must reach to be counted
        as soft core.
    :param show_bar: set to False to disable the progress bars.
    """
    global samples
    ppp.pan = pangenome  # use the global from partition to store the pangenome, so that it is usable

    # Work on a private copy: the default list is shared between calls, so filling it
    # in place would leak the resolved bounds into later calls (and into caller lists).
    krange = list(krange)
    try:
        if krange[0] < 0:
            krange[0] = ppp.pan.parameters["partition"]["K"]
        if krange[1] < 0:
            krange[1] = ppp.pan.parameters["partition"]["K"]
    except KeyError:
        krange = [3, 20]
    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needGraph=True, show_bar=show_bar)

    tmpdirObj = tempfile.TemporaryDirectory(dir=tmpdir)
    try:
        tmpdir = tmpdirObj.name

        if float(len(pangenome.organisms)) < maxSampling:
            maxSampling = len(pangenome.organisms)
        else:
            maxSampling = int(maxSampling)

        if K < 3 and kestimate is False:  # estimate K once and for all.
            try:
                K = ppp.pan.parameters["partition"]["K"]
                logging.getLogger().info(f"Reuse the number of partitions {K}")
            except KeyError:
                logging.getLogger().info("Estimating the number of partitions...")
                K = ppp.evaluate_nb_partitions(pangenome.organisms, sm_degree, free_dispersion, chunk_size,
                                               krange, 0.05, False, cpu, tmpdir, seed, None)
                logging.getLogger().info(f"The number of partitions has been evaluated at {K}")

        logging.getLogger().info("Extracting samples ...")
        # random.sample() needs a sequence (sets are rejected since Python 3.11); the
        # population is also built once instead of once per drawn sample.
        population = tuple(set(pangenome.organisms))
        AllSamples = [
            set(random.sample(population, size + 1))
            for size in range(minSampling, maxSampling)  # each point
            for _ in range(depth)  # number of samples per point
        ]
        logging.getLogger().info(
            f"Done sampling organisms in the pangenome, there are {len(AllSamples)} samples")
        SampNbPerPart = []

        logging.getLogger().info("Computing bitarrays for each family...")
        index_org = pangenome.computeFamilyBitarrays()
        logging.getLogger().info(
            f"Done computing bitarrays. Comparing them to get exact and soft core stats for {len(AllSamples)} samples...")

        bar = tqdm(range(len(AllSamples) * len(pangenome.geneFamilies)), unit="gene family",
                   disable=not show_bar)
        for samp in AllSamples:
            # make the sample's organism bitarray.
            sampBitarray = gmpy2.xmpz(0)  # pylint: disable=no-member
            for org in samp:
                sampBitarray[index_org[org]] = 1

            sample_size = len(samp)
            soft_threshold = sample_size * soft_core
            part = Counter()
            part["soft_core"] = 0
            part["exact_core"] = 0
            part["exact_accessory"] = 0
            part["soft_accessory"] = 0
            part["nborgs"] = sample_size
            for fam in pangenome.geneFamilies:
                nbCommonOrg = gmpy2.popcount(fam.bitarray & sampBitarray)  # pylint: disable=no-member
                if nbCommonOrg != 0:  # otherwise the family 'does not exist' in this sample
                    if nbCommonOrg == sample_size:
                        part["exact_core"] += 1
                    else:
                        part["exact_accessory"] += 1

                    if float(nbCommonOrg) >= soft_threshold:
                        part["soft_core"] += 1
                    else:
                        part["soft_accessory"] += 1
                bar.update()
            SampNbPerPart.append(part)
        bar.close()
        # done with frequency of each family for each sample.

        # module-level global read by the worker function.
        samples = AllSamples

        args = [(index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, K, krange, seed)
                for index in range(len(samples))]

        with Pool(processes=cpu) as p:
            # launch partitionnings
            logging.getLogger().info("Partitionning all samples...")
            bar = tqdm(range(len(args)), unit="samples partitionned", disable=not show_bar)
            random.shuffle(args)  # shuffling the processing so that the progress bar is closer to reality.
            for result in p.imap_unordered(launch_raref_nem, args):
                # result[1] is the sample index; counts already computed here win on key clashes.
                SampNbPerPart[result[1]] = {**result[0], **SampNbPerPart[result[1]]}
                bar.update()
        bar.close()
        logging.getLogger().info("Done partitionning everything")

        # Silence warnings only while drawing, then restore the previous filters
        # (resetwarnings() would discard every filter installed in the process).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            drawCurve(output, maxSampling, SampNbPerPart)
    finally:
        # remove the temporary directory even when a step above failed.
        tmpdirObj.cleanup()
    logging.getLogger().info("Done making the rarefaction curves")
def writeFlatFiles(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, csv=False, genePA=False,
                   gexf=False, light_gexf=False, projection=False, stats=False, json=False,
                   partitions=False, families_tsv=False, all_genes=False, all_prot_families=False,
                   all_gene_families=False, compress=False):
    """Write the requested flat files for a pangenome using a pool of worker processes.

    The pangenome is stored in the module-level global ``pan``. When no output is
    requested nothing else happens. Otherwise the pangenome is loaded through
    ``checkPangenomeInfo``, its status is checked against the requested outputs, its
    index is built, and one asynchronous task per requested output is submitted to a
    pool of ``cpu`` processes; the call returns once every task has completed.

    :param pangenome: pangenome to write.
    :param output: forwarded to every writer function.
    :param cpu: number of processes in the pool.
    :param soft_core: forwarded to the gexf, stats and partitions writers.
    :param dup_margin: forwarded to the stats writer.
    :param compress: forwarded to every writer function.
    :raises Exception: when the pangenome status is not compatible with a requested output.

    The remaining boolean parameters each select one output file.
    """
    global pan
    pan = pangenome
    processes = []

    requested = (csv, genePA, gexf, light_gexf, projection, stats, json, partitions, families_tsv,
                 all_genes, all_prot_families, all_gene_families)
    if not any(requested):
        # nothing to write, so there is no point in loading the pangenome.
        return

    checkPangenomeInfo(pan, needAnnotations=True, needFamilies=True, needGraph=True)

    in_memory = ("Loaded", "Computed")

    # could allow to write the csv or genePA without partition...
    if pan.status["partitionned"] not in in_memory and (
            light_gexf or gexf or csv or projection or partitions):
        raise Exception(
            "The provided pangenome has not been partitionned. This is not compatible with any of the following options : --light_gexf, --gexf, --csv, --partitions"
        )
    if pan.status["genesClustered"] not in in_memory and families_tsv:
        raise Exception(
            "The provided pangenome has not gene families. This is not compatible with any of the following options : --families_tsv --all_prot_families --all_gene_families"
        )
    if pan.status["genomesAnnotated"] not in in_memory and all_genes:
        raise Exception(
            "The provided pangenome has no annotated sequences. This is not compatible with any of the following options : --all_genes"
        )
    if pan.status["geneSequences"] not in ("inFile",) and (all_genes or all_gene_families):
        raise Exception(
            "The provided pangenome has no gene sequences. This is not compatible with any of the following options : --all_genes, --all_gene_families"
        )
    if pan.status["geneFamilySequences"] not in in_memory and all_prot_families:
        raise Exception(
            "The provided pangenome has no gene families. This is not compatible with any of the following options : --all_prot_families, all_gene_families"
        )

    pan.getIndex()  # make the index because it will be used most likely

    with Pool(processes=cpu) as pool:

        def submit(writer, *writer_args):
            # queue one writer in the pool and remember its handle.
            processes.append(pool.apply_async(func=writer, args=writer_args))

        if csv:
            submit(writeMatrix, ',', "csv", output, compress, True)
        if genePA:
            submit(writeGenePresenceAbsence, output, compress)
        if gexf:
            submit(writeGEXF, output, False, soft_core, compress)
        if light_gexf:
            submit(writeGEXF, output, True, soft_core, compress)
        if projection:
            submit(writeProjections, output, compress)
        if stats:
            submit(writeStats, output, soft_core, dup_margin, compress)
        if json:
            submit(writeJSON, output, compress)
        if partitions:
            submit(writeParts, output, soft_core, compress)
        if families_tsv:
            submit(writeGeneFamiliesTSV, output, compress)
        if all_genes:
            submit(writeGeneSequences, output, compress)
        if all_prot_families:
            submit(writeFastaProtFam, output, compress)
        if all_gene_families:
            submit(writeFastaGenFam, output, compress)

        for pending in processes:
            pending.get()  # get all the results
def align(pangenome, sequenceFile, output, tmpdir, identity=0.8, coverage=0.8, no_defrag=False, cpu=1,
          getinfo=False, draw_related=False, disable_bar=False):
    """Align sequences on the pangenome gene families and project partitions on them.

    :param pangenome: pangenome whose gene family sequences are available.
    :param sequenceFile: forwarded to ``get_seq2pang``.
    :param output: forwarded to ``get_seq2pang``, ``getSeqInfo`` and ``projectPartition``.
    :param tmpdir: directory in which a private temporary directory is created.
    :param identity: forwarded to ``get_seq2pang``.
    :param coverage: forwarded to ``get_seq2pang``.
    :param no_defrag: forwarded to ``get_seq2pang``.
    :param cpu: forwarded to ``get_seq2pang``.
    :param getinfo: when set, annotations, partitions, RGP and spots are loaded too
        and ``getSeqInfo`` is called.
    :param draw_related: same loading as ``getinfo``; also forwarded to ``getSeqInfo``.
    :param disable_bar: forwarded to the loading and info steps.
    :raises Exception: when the pangenome has no gene family sequences.
    """
    if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]:
        raise Exception(
            "Cannot use this function as your pangenome does not have gene families representatives "
            "associated to it. For now this works only if the clustering is realised by PPanGGOLiN."
        )
    # could be possible either by picking a representative somehow, or by aligning on genes rather than on
    # families, if they are in the pangenome.

    wants_info = getinfo or draw_related
    if wants_info:
        # modules are not required to be loaded, but if they have been computed we load them.
        need_mod = pangenome.status["modules"] != "No"
        checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True, needPartitions=True,
                           needRGP=True, needSpots=True, needModules=need_mod, disable_bar=disable_bar)
    else:
        checkPangenomeInfo(pangenome, needFamilies=True, disable_bar=disable_bar)

    new_tmpdir = tempfile.TemporaryDirectory(dir=tmpdir)
    try:
        # the TemporaryDirectory object itself (not its name) is what get_seq2pang receives.
        seqSet, alignFile, seq2pang = get_seq2pang(pangenome, sequenceFile, output, new_tmpdir, cpu,
                                                   no_defrag, identity, coverage)

        if wants_info:
            getSeqInfo(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar)
        partProj = projectPartition(seq2pang, seqSet, output)  # write the partition assignation only
        logging.getLogger().info(f"sequences partition projection : '{partProj}'")
        logging.getLogger().info(
            f"{len(seq2pang)} sequences over {len(seqSet)} have at least one hit in the pangenome.")
        logging.getLogger().info(f"Blast-tab file of the alignment : '{alignFile}'")
    finally:
        # always remove the temporary directory, including when a step above raised.
        new_tmpdir.cleanup()
def writeSequenceFiles(pangenome, output, fasta=None, anno=None, cpu=1, regions=None, genes=None,
                       gene_families=None, prot_families=None, compress=False, show_bar=True):
    """Write the requested sequence files of a pangenome.

    The values of ``regions``, ``genes``, ``gene_families`` and ``prot_families`` select
    what is written and determine which parts of the pangenome are loaded through
    ``checkPangenomeInfo`` before the matching writer functions are called.

    :param pangenome: pangenome to read from.
    :param output: forwarded to every writer function.
    :param fasta: forwarded to ``writeRegionsSequences``.
    :param anno: forwarded to ``writeRegionsSequences``.
    :param cpu: not used by this function.
    :param regions: selection forwarded to ``writeRegionsSequences``.
    :param genes: selection forwarded to ``writeGeneSequences``.
    :param gene_families: selection forwarded to ``writeFastaGeneFam``.
    :param prot_families: selection forwarded to ``writeFastaProtFam``.
    :param compress: forwarded to every writer function.
    :param show_bar: forwarded to every writer function.
    :raises Exception: when nothing is selected, or when the pangenome lacks the
        sequences needed by a selection.
    """
    if not any((regions, genes, prot_families, gene_families)):
        raise Exception("You did not indicate what file you wanted to write.")

    gene_level = (genes, gene_families, prot_families)
    partition_names = ("persistent", "shell", "cloud")

    # work out what has to be loaded from the selections that were made.
    needAnnotations = any(choice is not None for choice in (regions, genes, gene_families, prot_families))
    needRegions = regions is not None or any(choice == "rgp" for choice in gene_level)
    needFamilies = (regions is not None or gene_families is not None or prot_families is not None
                    or genes in partition_names)
    needPartitions = any(choice in partition_names for choice in gene_level)
    needGraph = False
    needSpots = False

    # need to deal with sequence-related flags outside of checkPangenomeInfo since
    if pangenome.status["geneSequences"] not in ("inFile",) and (genes or gene_families):
        raise Exception(
            "The provided pangenome has no gene sequences. This is not compatible with any of the following options : --genes, --gene_families"
        )
    if pangenome.status["geneFamilySequences"] not in ("Loaded", "Computed", "inFile") and prot_families:
        raise Exception(
            "The provided pangenome has no gene families. This is not compatible with any of the following options : --prot_families, --gene_families"
        )

    checkPangenomeInfo(pangenome, needAnnotations=needAnnotations, needFamilies=needFamilies,
                       needGraph=needGraph, needPartitions=needPartitions, needRGP=needRegions,
                       needSpots=needSpots)

    if prot_families is not None:
        writeFastaProtFam(pangenome, output, compress, prot_families, show_bar=show_bar)
    if gene_families is not None:
        writeFastaGeneFam(pangenome, output, compress, gene_families, show_bar=show_bar)
    if genes is not None:
        writeGeneSequences(pangenome, output, compress, genes, show_bar=show_bar)
    if regions is not None:
        writeRegionsSequences(pangenome, output, compress, regions, fasta, anno, show_bar=show_bar)
def readClustering(pangenome, families_tsv_file, infer_singletons=False, force=False, disable_bar=False):
    """
    Creates the pangenome, the gene families and the genes with an associated gene family.
    Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome.

    Each line holds 2 or 3 whitespace-separated fields: family id, gene id and an
    optional fragment flag ("F" marks the gene as a fragment).

    :param pangenome: pangenome with loaded annotations; filled in place.
    :param families_tsv_file: path of the clustering file, opened with ``read_compressed_or_not``.
    :param infer_singletons: when some genes have no cluster, call ``inferSingletons``
        instead of raising.
    :param force: forwarded to ``checkPangenomeFormerClustering``.
    :param disable_bar: set to True to disable the progress bar.
    :raises Exception: when a line cannot be processed, when no gene id matches the
        annotations, or when some genes have no cluster and ``infer_singletons`` is False.
    """
    checkPangenomeFormerClustering(pangenome, force)
    checkPangenomeInfo(pangenome, needAnnotations=True, disable_bar=disable_bar)

    logging.getLogger().info("Reading " + families_tsv_file + " the gene families file ...")
    filesize = os.stat(families_tsv_file).st_size
    families_tsv_file = read_compressed_or_not(families_tsv_file)
    frag = False
    # the genome annotations are necessarily loaded.
    nbGeneWithFam = 0
    try:
        localDict = mkLocal2Gene(pangenome)
        bar = tqdm(total=filesize, unit="bytes", disable=disable_bar)
        try:
            for lineCounter, line in enumerate(families_tsv_file, start=1):
                bar.update(len(line))
                try:
                    elements = [el.strip() for el in line.split()]  # 2 or 3 fields expected
                    if len(elements) <= 1:
                        raise ValueError("No tabulation separator found in gene families file")
                    (fam_id, gene_id, is_frag) = elements if len(elements) == 3 else elements + [None]
                    try:
                        geneObj = pangenome.getGene(gene_id)
                    except KeyError:
                        # fall back on the local identifiers of the annotation.
                        geneObj = localDict.get(gene_id)
                    if geneObj is not None:
                        nbGeneWithFam += 1
                        fam = pangenome.addGeneFamily(fam_id)
                        geneObj.is_fragment = is_frag == "F"
                        fam.addGene(geneObj)
                    if is_frag == "F":
                        frag = True
                except Exception as err:
                    # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through,
                    # and chaining keeps the real cause visible in the traceback.
                    raise Exception(
                        f"line {lineCounter} of the file '{families_tsv_file.name}' raised an error."
                    ) from err
        finally:
            bar.close()
    finally:
        # close the clustering file even when a line failed.
        families_tsv_file.close()

    if nbGeneWithFam < len(pangenome.genes):  # not all genes have an associated cluster
        if nbGeneWithFam == 0:
            raise Exception(
                "No gene ID in the cluster file matched any gene ID from the annotation step."
                " Please ensure that the annotations that you loaded previously and the clustering results "
                "that you have used the same gene IDs. If you use .gff files it is the identifier stored in"
                " the field 'ID'. If you use .gbff files it is the identifier stored in 'locus_tag'."
            )
        if infer_singletons:
            inferSingletons(pangenome)
        else:
            raise Exception(
                f"Some genes ({len(pangenome.genes) - nbGeneWithFam}) did not have an associated "
                f"cluster. Either change your cluster file so that each gene has a cluster, "
                f"or use the --infer_singletons option to infer a cluster for each non-clustered gene."
            )
    pangenome.status["genesClustered"] = "Computed"
    if frag:  # if there was fragment information in the file.
        pangenome.status["defragmented"] = "Computed"
    pangenome.parameters["cluster"] = {}
    pangenome.parameters["cluster"]["read_clustering_from_file"] = True
    pangenome.parameters["cluster"]["infer_singletons"] = infer_singletons
def writeFlatFiles(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, csv=False, genePA=False,
                   gexf=False, light_gexf=False, projection=False, stats=False, json=False,
                   partitions=False, regions=False, families_tsv=False, all_genes=False,
                   all_prot_families=False, all_gene_families=False, spots=False, borders=False,
                   compress=False):
    """Write the requested flat files for a pangenome using a pool of worker processes.

    The pangenome is stored in the module-level global ``pan``. The requested outputs
    decide which parts of the pangenome are loaded through ``checkPangenomeInfo``; the
    index is then built and one asynchronous task per requested output is submitted to
    a pool of ``cpu`` processes. The call returns once every task has completed.

    :param pangenome: pangenome to write.
    :param output: forwarded to every writer function.
    :param cpu: number of processes in the pool.
    :param soft_core: forwarded to the gexf, stats and partitions writers.
    :param dup_margin: forwarded to the stats and borders writers.
    :param compress: forwarded to every writer function.
    :raises Exception: when no output is requested, or when the pangenome lacks the
        sequences needed by a requested output.

    The remaining boolean parameters each select one output file.
    """
    global pan

    if not any((csv, genePA, gexf, light_gexf, projection, stats, json, partitions, regions, spots,
                borders, families_tsv, all_genes, all_prot_families, all_gene_families)):
        raise Exception("You did not indicate what file you wanted to write.")

    pan = pangenome
    processes = []

    # work out what has to be loaded from the outputs that were requested.
    family_outputs = (csv or genePA or gexf or light_gexf or projection or stats or json or partitions
                      or regions or spots or families_tsv or borders)
    needAnnotations = bool(family_outputs)
    needFamilies = bool(family_outputs or all_prot_families or all_gene_families)
    needPartitions = bool(projection or stats or partitions or regions or spots or borders)
    needGraph = bool(gexf or light_gexf or json)
    needRegions = bool(regions or spots or borders)
    needSpots = bool(spots or borders)

    # need to deal with sequence-related flags outside of checkPangenomeInfo since
    if pan.status["geneSequences"] not in ("inFile",) and (all_genes or all_gene_families):
        raise Exception(
            "The provided pangenome has no gene sequences. This is not compatible with any of the following options : --all_genes, --all_gene_families"
        )
    if pan.status["geneFamilySequences"] not in ("Loaded", "Computed", "inFile") and all_prot_families:
        raise Exception(
            "The provided pangenome has no gene families. This is not compatible with any of the following options : --all_prot_families, all_gene_families"
        )

    checkPangenomeInfo(pan, needAnnotations=needAnnotations, needFamilies=needFamilies,
                       needGraph=needGraph, needPartitions=needPartitions, needRGP=needRegions,
                       needSpots=needSpots)
    pan.getIndex()  # make the index because it will be used most likely

    with Pool(processes=cpu) as pool:

        def submit(writer, *writer_args):
            # queue one writer in the pool and remember its handle.
            processes.append(pool.apply_async(func=writer, args=writer_args))

        if csv:
            submit(writeMatrix, ',', "csv", output, compress, True)
        if genePA:
            submit(writeGenePresenceAbsence, output, compress)
        if gexf:
            submit(writeGEXF, output, False, soft_core, compress)
        if light_gexf:
            submit(writeGEXF, output, True, soft_core, compress)
        if projection:
            submit(writeProjections, output, compress)
        if stats:
            submit(writeStats, output, soft_core, dup_margin, compress)
        if json:
            submit(writeJSON, output, compress)
        if partitions:
            submit(writeParts, output, soft_core, compress)
        if families_tsv:
            submit(writeGeneFamiliesTSV, output, compress)
        if all_genes:
            submit(writeGeneSequences, output, compress)
        if all_prot_families:
            submit(writeFastaProtFam, output, compress)
        if all_gene_families:
            submit(writeFastaGenFam, output, compress)
        if regions:
            submit(writeRegions, output, compress)
        if spots:
            submit(writeSpots, output, compress)
        if borders:
            submit(writeBorders, output, dup_margin, compress)

        for pending in processes:
            pending.get()  # get all the results