def launch(args):
    """
    Command-line entry point: align proteins and/or project RGPs on a pangenome.

    Creates the output directory, builds a Pangenome bound to the file given
    in ``args.pangenome`` and then runs up to two independent steps:

    * ``align`` when ``args.proteins`` is not None;
    * ``projectRGP`` when ``args.annotation`` is not None.

    :param args: parsed command-line arguments (the attributes read are
        visible in the body below).
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    if args.proteins is not None:
        align(
            pangenome=pangenome,
            proteinFile=args.proteins,
            output=args.output,
            tmpdir=args.tmpdir,
            identity=args.identity,
            coverage=args.coverage,
            defrag=args.defrag,
            cpu=args.cpu,
            getinfo=args.getinfo,
            draw_related=args.draw_related,
        )

    if args.annotation is not None:
        # Everything but `pseudo` is passed positionally, in the callee's order.
        projectRGP(
            pangenome,
            args.annotation,
            args.output,
            args.tmpdir,
            args.identity,
            args.coverage,
            args.defrag,
            args.cpu,
            args.translation_table,
            pseudo=args.use_pseudo,
        )
def launch(args):
    """
    Command-line entry point: write the flat output files of a pangenome.

    Creates the output directory, binds a new Pangenome to ``args.pangenome``
    and forwards every output switch to ``writeFlatFiles`` as a keyword
    argument.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    # Options whose keyword name in writeFlatFiles equals the attribute name
    # on `args`.
    same_name_options = (
        "cpu", "soft_core", "dup_margin", "csv", "gexf", "light_gexf",
        "projection", "stats", "json", "partitions", "regions",
        "families_tsv", "all_genes", "all_prot_families",
        "all_gene_families", "spots", "borders", "compress",
    )
    options = {name: getattr(args, name) for name in same_name_options}
    # The only option that is renamed: the --Rtab switch feeds `genePA`.
    options["genePA"] = args.Rtab

    writeFlatFiles(pangenome, args.output, **options)
def launch(args):
    """
    Command-line entry point of the partition step.

    The output directory is only created when ``args.draw_ICL`` or
    ``args.keep_tmp_files`` is set. The pangenome bound to ``args.pangenome``
    is then partitioned and written back to its own file.

    :param args: parsed command-line arguments.
    """
    needs_outdir = args.draw_ICL or args.keep_tmp_files
    if needs_outdir:
        mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    # Positional arguments: the order below must be kept as is.
    partition(
        pangenome,
        args.tmpdir,
        args.output,
        args.force,
        args.beta,
        args.max_degree_smoothing,
        args.free_dispersion,
        args.chunk_size,
        args.nb_of_partitions,
        args.krange,
        args.ICL_margin,
        args.draw_ICL,
        args.cpu,
        args.seed,
        args.keep_tmp_files,
        show_bar=args.show_prog_bars,
    )

    writePangenome(pangenome, pangenome.file, args.force,
                   show_bar=args.show_prog_bars)
def launch(args):
    """
    Command-line entry point: predict hotspots and save the pangenome.

    A Pangenome is bound to ``args.pangenome``. The output directory is
    created only when ``args.spot_graph`` or ``args.draw_hotspots`` is set.
    After ``predictHotspots`` has run, the pangenome is written back to its
    own file.

    :param args: parsed command-line arguments.
    """
    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    needs_outdir = args.spot_graph or args.draw_hotspots
    if needs_outdir:
        mkOutdir(args.output, args.force)

    predictHotspots(
        pangenome,
        args.output,
        force=args.force,
        cpu=args.cpu,
        spot_graph=args.spot_graph,
        overlapping_match=args.overlapping_match,
        set_size=args.set_size,
        exact_match=args.exact_match_size,
        draw_hotspot=args.draw_hotspots,
        interest=args.interest,
    )

    writePangenome(pangenome, pangenome.file, args.force)
def launch(args):
    """
    Command-line entry point: draw the requested pangenome figures.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    draws the tile plot (``args.tile_plot``) and/or the U-shaped curve
    (``args.ucurve``). The two switches are independent of each other.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    if args.tile_plot:
        drawTilePlot(pangenome, args.output, args.nocloud)

    if args.ucurve:
        drawUCurve(pangenome, args.output, soft_core=args.soft_core)
def launchSequences(args):
    """
    Command-line entry point: write sequence files for a pangenome.

    The options are checked first (``checkOptions``), before anything is
    created on disk. The output directory is then created, a Pangenome is
    bound to ``args.pangenome`` and ``writeSequenceFiles`` is called.

    :param args: parsed command-line arguments.
    """
    checkOptions(args)
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    writeSequenceFiles(
        pangenome,
        args.output,
        fasta=args.fasta,
        anno=args.anno,
        soft_core=args.soft_core,
        regions=args.regions,
        genes=args.genes,
        gene_families=args.gene_families,
        prot_families=args.prot_families,
        compress=args.compress,
        disable_bar=args.disable_prog_bar,
    )
def launch(args):
    """
    Command-line entry point: write the flat output files of a pangenome.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    calls ``writeFlatFiles`` with every option passed positionally.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    # NOTE(review): all options are positional, so this order has to match the
    # parameter order of writeFlatFiles - confirm against its signature.
    positional_options = (
        args.cpu,
        args.soft_core,
        args.dup_margin,
        args.csv,
        args.Rtab,
        args.gexf,
        args.light_gexf,
        args.projection,
        args.stats,
        args.json,
        args.partitions,
        args.families_tsv,
        args.all_genes,
        args.all_prot_families,
        args.all_gene_families,
        args.compress,
    )
    writeFlatFiles(pangenome, args.output, *positional_options)
def launchMSA(args):
    """
    Command-line entry point: write multiple sequence alignments.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    hands the command-line options over to ``writeMSAFiles``.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    msa_options = dict(
        cpu=args.cpu,
        partition=args.partition,
        tmpdir=args.tmpdir,
        source=args.source,
        force=args.force,
        show_bar=args.show_prog_bars,
    )
    writeMSAFiles(pangenome, args.output, **msa_options)
def launch(args):
    """
    Command-line entry point: predict hotspots and save the pangenome.

    A Pangenome is bound to ``args.pangenome``. The output directory is only
    created when ``args.spot_graph`` is set. If any of the drawing options
    (``draw_hotspots``, ``interest``, ``fig_margin``, ``priority``) is set, a
    deprecation warning is logged; those options are not forwarded to
    ``predictHotspots``. The pangenome is finally written back to its own
    file.

    :param args: parsed command-line arguments.
    """
    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    if args.spot_graph:
        mkOutdir(args.output, args.force)

    deprecated_option_used = (args.draw_hotspots or args.interest
                              or args.fig_margin or args.priority)
    if deprecated_option_used:
        logging.getLogger().warning(
            "Options to draw the spots with the 'ppanggolin spot' subcommand have been deprecated, "
            "and are now dealt with in a dedicated subcommand 'ppanggolin drawspot'.")

    predictHotspots(
        pangenome,
        args.output,
        force=args.force,
        cpu=args.cpu,
        spot_graph=args.spot_graph,
        overlapping_match=args.overlapping_match,
        set_size=args.set_size,
        exact_match=args.exact_match_size,
        disable_bar=args.disable_prog_bar,
    )

    writePangenome(pangenome, pangenome.file, args.force,
                   disable_bar=args.disable_prog_bar)
def launchSequences(args):
    """
    Command-line entry point: write sequence files for a pangenome.

    Creates the output directory and binds a Pangenome to ``args.pangenome``.
    The options are checked (``checkOptions``) only after that, right before
    ``writeSequenceFiles`` is called.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    checkOptions(args)

    writeSequenceFiles(
        pangenome,
        args.output,
        fasta=args.fasta,
        anno=args.anno,
        cpu=args.cpu,
        regions=args.regions,
        genes=args.genes,
        prot_families=args.prot_families,
        gene_families=args.gene_families,
        compress=args.compress,
        show_bar=args.show_prog_bars,
    )
def launch(args):
    """
    Command-line entry point: draw the requested pangenome figures.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    then draws, independently of each other:

    * the tile plot when ``args.tile_plot`` is set;
    * the U-shaped curve when ``args.ucurve`` is set;
    * the spots when ``args.spots`` is anything but the empty string.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    if args.tile_plot:
        drawTilePlot(pangenome, args.output, args.nocloud,
                     disable_bar=args.disable_prog_bar)

    if args.ucurve:
        drawUCurve(pangenome, args.output, soft_core=args.soft_core,
                   disable_bar=args.disable_prog_bar)

    if args.spots != '':
        drawSpots(
            pangenome=pangenome,
            output=args.output,
            spot_list=args.spots,
            disable_bar=args.disable_prog_bar,
        )
def launch(args):
    """
    Command-line entry point: align sequences against a pangenome.

    Creates the output directory and binds a Pangenome to ``args.pangenome``.
    A deprecation warning is logged when ``args.interest``,
    ``args.fig_margin`` or ``args.label_priority`` is set; none of these is
    forwarded to ``align``.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    deprecated_option_used = (args.interest or args.fig_margin
                              or args.label_priority)
    if deprecated_option_used:
        logging.getLogger().warning(
            "Options --interest, --fig_margin and --label_priority are deprecated, "
            "and the actions they defined are now doable directly in the interactive figures "
            "that are drawn")

    align(
        pangenome=pangenome,
        sequenceFile=args.sequences,
        output=args.output,
        tmpdir=args.tmpdir,
        cpu=args.cpu,
        identity=args.identity,
        coverage=args.coverage,
        no_defrag=args.no_defrag,
        getinfo=args.getinfo,
        draw_related=args.draw_related,
        disable_bar=args.disable_prog_bar,
    )
def launch(args):
    """
    Command-line entry point: compute the rarefaction curve of a pangenome.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    calls ``makeRarefactionCurve``, translating the command-line option names
    into the keyword names that function expects.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    makeRarefactionCurve(
        pangenome=pangenome,
        output=args.output,
        tmpdir=args.tmpdir,
        beta=args.beta,
        depth=args.depth,
        minSampling=args.min,
        maxSampling=args.max,
        sm_degree=args.max_degree_smoothing,
        free_dispersion=args.free_dispersion,
        chunk_size=args.chunk_size,
        K=args.nb_of_partitions,
        cpu=args.cpu,
        seed=args.seed,
        kestimate=args.reestimate_K,
        krange=args.krange,
        soft_core=args.soft_core,
    )
def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", source="protein", force=False,
                  show_bar=True):
    """
    Compute and write the MSA of the families selected by a partition filter.

    The alignments go to the ``msa_<partition>_<source>/`` sub-directory of
    ``output``, which is created here.

    :param pangenome: pangenome whose families are aligned.
    :param output: path of the parent output directory (concatenated with a string).
    :param cpu: forwarded to ``computeMSA``.
    :param partition: filter handed to ``getFamiliesToWrite``; partition
        information is requested from the pangenome only for "persistent",
        "shell" and "cloud".
    :param tmpdir: forwarded to ``computeMSA``.
    :param source: forwarded to ``computeMSA`` and used in the directory name.
    :param force: forwarded to ``mkOutdir`` for the MSA directory.
    :param show_bar: forwarded to ``checkPangenomeInfo`` and ``computeMSA``.
    """
    log = logging.getLogger()

    needPartitions = partition in ("persistent", "shell", "cloud")

    outname = output + f"/msa_{partition}_{source}/"
    mkOutdir(outname, force=force)

    checkPangenomeInfo(
        pangenome,
        needAnnotations=True,
        needFamilies=True,
        needPartitions=needPartitions,
        needGeneSequences=True,
        show_bar=show_bar,
    )
    log.info(f"computing msa for {partition} families...")
    families = getFamiliesToWrite(pangenome, partitionFilter=partition)

    # This entry must exist: the pangenome has been loaded and families are required.
    code = pangenome.parameters["cluster"]["translation_table"]

    computeMSA(families, outname, cpu=cpu, tmpdir=tmpdir, source=source,
               code=code, show_bar=show_bar)
    log.info(f"Done writing all {partition} MSA in: {outname}")
def write_nem_input_files(tmpdir, organisms, sm_degree):
    """
    Write the NEM input files for a subset of organisms.

    Five files are written in ``tmpdir``: ``column_org_file``,
    ``nem_file.str``, ``nem_file.index``, ``nem_file.nei`` and
    ``nem_file.dat``. The gene families are read from a ``pan`` object that is
    not a parameter of this function; it has to exist in the enclosing scope.

    :param tmpdir: directory receiving the files (created with ``force=False``).
    :param organisms: organisms to consider; must support ``isdisjoint`` and
        ``&`` with ``fam.organisms`` (i.e. behave like a set).
    :param sm_degree: a family with this many neighbors or more is written
        without any neighbor in the ``.nei`` file.
    :return: tuple (half of the summed edge weights that were written,
        number of families indexed).
    """
    mkOutdir(tmpdir, force=False)
    total_edges_weight = 0

    with open(tmpdir + "/column_org_file", "w") as org_file:
        quoted_names = [f'"{org.name}"' for org in organisms]
        org_file.write(" ".join(quoted_names) + "\n")

    logging.getLogger().debug(
        "Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files")
    with open(tmpdir + "/nem_file.str", "w") as str_file, \
            open(tmpdir + "/nem_file.index", "w") as index_file, \
            open(tmpdir + "/nem_file.nei", "w") as nei_file, \
            open(tmpdir + "/nem_file.dat", "w") as dat_file:

        nei_file.write("1\n")

        # Column of each organism in a .dat row, plus the all-absent row template.
        index_org = {org: column for column, org in enumerate(organisms)}
        default_dat = ['0'] * len(organisms)

        # family -> 1-based row number, in order of first appearance.
        index_fam = {}
        for fam in pan.geneFamilies:
            # could use bitarrays if this part is limiting?
            if organisms.isdisjoint(fam.organisms):
                continue
            presence_row = list(default_dat)
            for org in fam.organisms & organisms:
                presence_row[index_org[org]] = "1"
            dat_file.write("\t".join(presence_row) + "\n")
            index_fam[fam] = len(index_fam) + 1
            index_file.write(f"{len(index_fam)}\t{fam.name}\n")

        for fam, fam_row in index_fam.items():
            neighbor_rows = []
            neighbor_scores = []
            sum_dist_score = 0
            for edge in fam.edges:  # iterate on the family's edges.
                coverage = sum(len(gene_list)
                               for org, gene_list in edge.organisms.items()
                               if org in organisms)
                if coverage == 0:
                    # nothing interesting to write, this edge does not exist
                    # with this subset of organisms.
                    continue
                distance_score = coverage / len(organisms)
                sum_dist_score += distance_score
                neighbor = edge.target if fam == edge.source else edge.source
                neighbor_rows.append(str(index_fam[neighbor]))
                neighbor_scores.append(str(round(distance_score, 4)))

            # One entry was appended per retained edge.
            neighbor_number = len(neighbor_rows)
            if neighbor_number > 0 and float(neighbor_number) < sm_degree:
                total_edges_weight += sum_dist_score
                fields = [str(fam_row), str(neighbor_number)]
                fields.extend(neighbor_rows)
                fields.extend(neighbor_scores)
                nei_file.write('\t'.join(fields) + "\n")
            else:
                nei_file.write(str(fam_row) + "\t0\n")

        str_file.write("S\t" + str(len(index_fam)) + "\t" + str(len(organisms)) + "\n")

    # NOTE(review): halved presumably because each edge is visited from both
    # of its families - confirm.
    return total_edges_weight / 2, len(index_fam)
def launch(args):
    """
    Command-line entry point of the complete workflow.

    Steps, in order: option check, annotation (read from ``args.anno`` or
    computed from ``args.fasta``), clustering (read from ``args.clusters`` or
    computed), neighbors graph, partition, RGP prediction, spot prediction and
    module prediction. The pangenome is written to the HDF5 file after several
    of these steps. Unless ``args.only_pangenome`` is set, figures and flat
    files are then produced. The time spent in each step is logged at the end.

    NOTE(review): if neither ``args.anno`` nor ``args.fasta`` is given, the
    annotation/clustering/writing timers stay None and the later arithmetic
    on them would fail - presumably ``check_option_workflow`` rules that out;
    confirm.

    :param args: parsed command-line arguments.
    :raises Exception: when annotations carry no sequences and neither
        clusters nor fasta files were provided.
    """
    check_option_workflow(args)
    pangenome = Pangenome()
    filename = mkFilename(args.basename, args.output, args.force)

    writing_time = None
    anno_time = None
    clust_time = None
    mod_time = None
    desc_time = None

    if args.anno:
        # Annotations are provided: read them instead of annotating.
        tick = time.time()
        readAnnotations(pangenome, args.anno, cpu=args.cpu, disable_bar=args.disable_prog_bar)
        anno_time = time.time() - tick

        tick = time.time()
        writePangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
        writing_time = time.time() - tick

        if args.clusters is None and pangenome.status["geneSequences"] == "No":
            # No clusters and no sequences in the annotations: fasta is the only source left.
            if args.fasta is None:
                raise Exception(
                    "The gff/gbff provided did not have any sequence informations, "
                    "you did not provide clusters and you did not provide fasta file. "
                    "Thus, we do not have the information we need to continue the analysis.")
            getGeneSequencesFromFastas(pangenome, args.fasta)

        tick = time.time()
        if args.clusters is not None:
            readClustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar)
        else:
            # we should have the sequences here.
            clustering(pangenome, args.tmpdir, args.cpu, identity=args.identity,
                       coverage=args.coverage, mode=args.mode, defrag=not args.no_defrag,
                       disable_bar=args.disable_prog_bar)
        clust_time = time.time() - tick

    elif args.fasta is not None:
        tick = time.time()
        annotatePangenome(pangenome, args.fasta, args.tmpdir, args.cpu,
                          contig_filter=args.contig_filter, disable_bar=args.disable_prog_bar)
        anno_time = time.time() - tick

        tick = time.time()
        writePangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
        writing_time = time.time() - tick

        tick = time.time()
        clustering(pangenome, args.tmpdir, args.cpu, identity=args.identity,
                   coverage=args.coverage, mode=args.mode, defrag=not args.no_defrag,
                   disable_bar=args.disable_prog_bar)
        clust_time = time.time() - tick

    # This write is not included in the reported writing time.
    writePangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)

    tick = time.time()
    computeNeighborsGraph(pangenome, disable_bar=args.disable_prog_bar)
    graph_time = time.time() - tick

    tick = time.time()
    partition(pangenome, tmpdir=args.tmpdir, cpu=args.cpu, K=args.nb_of_partitions,
              disable_bar=args.disable_prog_bar)
    part_time = time.time() - tick

    tick = time.time()
    writePangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
    writing_time = writing_time + time.time() - tick

    tick = time.time()
    predictRGP(pangenome, disable_bar=args.disable_prog_bar)
    regions_time = time.time() - tick

    tick = time.time()
    predictHotspots(pangenome, args.output, disable_bar=args.disable_prog_bar)
    spot_time = time.time() - tick

    tick = time.time()
    predictModules(pangenome=pangenome, cpu=args.cpu, tmpdir=args.tmpdir,
                   disable_bar=args.disable_prog_bar)
    mod_time = time.time() - tick

    tick = time.time()
    writePangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar)
    writing_time = writing_time + time.time() - tick

    if not args.only_pangenome:
        # Spot drawing is accounted for in the spot timer.
        tick = time.time()
        mkOutdir(args.output + '/spot_figures', force=True)
        drawSpots(pangenome=pangenome, output=args.output + '/spot_figures',
                  spot_list='all', disable_bar=args.disable_prog_bar)
        spot_time = spot_time + time.time() - tick

        if args.rarefaction:
            makeRarefactionCurve(pangenome, args.output, args.tmpdir, cpu=args.cpu,
                                 disable_bar=args.disable_prog_bar)

        genome_count = len(pangenome.organisms)
        if 1 < genome_count < 5000:
            # The cloud is left out of the tile plot from 500 genomes on.
            drawTilePlot(pangenome, args.output, nocloud=genome_count >= 500)
        drawUCurve(pangenome, args.output)

        tick = time.time()
        writeFlatFiles(pangenome, args.output, args.cpu, csv=True, genePA=True, gexf=True,
                       light_gexf=True, projection=True, json=True, stats=True,
                       partitions=True, regions=True, spots=True, borders=True,
                       spot_modules=True, modules=True)
        desc_time = time.time() - tick

    log = logging.getLogger()
    log.info(f"Annotation took : {round(anno_time, 2)} seconds")
    log.info(f"Clustering took : {round(clust_time, 2)} seconds")
    log.info(f"Building the graph took : {round(graph_time, 2)} seconds")
    log.info(f"Partitioning the pangenome took : {round(part_time, 2)} seconds")
    log.info(f"Predicting RGP took : {round(regions_time, 2)} seconds")
    log.info(f"Gathering RGP into spots took : {round(spot_time, 2)} seconds")
    log.info(f"Predicting modules took : {round(mod_time, 2)} seconds")
    log.info(f"Writing the pangenome data in HDF5 took : {round(writing_time, 2)} seconds")
    if not args.only_pangenome:
        log.info(f"Writing descriptive files for the pangenome took : {round(desc_time, 2)} seconds")

    printInfo(filename, content=True)
def launch(args):
    """
    Command-line entry point: align proteins against a pangenome.

    Creates the output directory, binds a Pangenome to ``args.pangenome`` and
    calls ``align`` with every option passed positionally.

    :param args: parsed command-line arguments.
    """
    mkOutdir(args.output, args.force)

    pangenome = Pangenome()
    pangenome.addFile(args.pangenome)

    # NOTE(review): positional call - the order has to match the parameter
    # order of align; confirm against its signature.
    align(
        pangenome,
        args.proteins,
        args.output,
        args.tmpdir,
        args.identity,
        args.coverage,
        args.defrag,
        args.cpu,
    )