Exemple #1
0
def launch(args):
    """Run the alignment and/or RGP projection requested on the command line.

    The output directory is prepared with ``mkOutdir`` and the file named by
    ``args.pangenome`` is attached to a fresh ``Pangenome``.  ``align`` is
    called only when ``args.proteins`` is set and ``projectRGP`` only when
    ``args.annotation`` is set; both can run during the same invocation.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    wants_alignment = args.proteins is not None
    if wants_alignment:
        align(pangenome=pan, proteinFile=args.proteins, output=args.output,
              tmpdir=args.tmpdir, identity=args.identity,
              coverage=args.coverage, defrag=args.defrag, cpu=args.cpu,
              getinfo=args.getinfo, draw_related=args.draw_related)

    wants_projection = args.annotation is not None
    if wants_projection:
        projectRGP(pan, args.annotation, args.output, args.tmpdir,
                   args.identity, args.coverage, args.defrag, args.cpu,
                   args.translation_table, pseudo=args.use_pseudo)
Exemple #2
0
def launch(args):
    """Write the flat files selected on the command line for a pangenome.

    Prepares the output directory with ``mkOutdir``, attaches the file named
    by ``args.pangenome`` to a fresh ``Pangenome`` and forwards every
    output-selection option to ``writeFlatFiles`` as keyword arguments.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    # Keyword name expected by writeFlatFiles -> value read from the CLI.
    # Note that the ``--Rtab`` option feeds the ``genePA`` keyword.
    flat_file_options = {
        "cpu": args.cpu,
        "soft_core": args.soft_core,
        "dup_margin": args.dup_margin,
        "csv": args.csv,
        "genePA": args.Rtab,
        "gexf": args.gexf,
        "light_gexf": args.light_gexf,
        "projection": args.projection,
        "stats": args.stats,
        "json": args.json,
        "partitions": args.partitions,
        "regions": args.regions,
        "families_tsv": args.families_tsv,
        "all_genes": args.all_genes,
        "all_prot_families": args.all_prot_families,
        "all_gene_families": args.all_gene_families,
        "spots": args.spots,
        "borders": args.borders,
        "compress": args.compress,
    }
    writeFlatFiles(pan, args.output, **flat_file_options)
Exemple #3
0
def launch(args):
    """Main code run when partition is launched from the command line.

    The output directory is only prepared when ICL drawing or temporary file
    retention is requested.  After ``partition`` returns, the pangenome is
    written back to the file it was loaded from (``pangenome.file``).

    :param args: parsed command-line arguments
    """
    needs_outdir = args.draw_ICL or args.keep_tmp_files
    if needs_outdir:
        mkOutdir(args.output, args.force)

    pan = Pangenome()
    pan.addFile(args.pangenome)

    partition(pan, args.tmpdir, args.output, args.force, args.beta,
              args.max_degree_smoothing, args.free_dispersion,
              args.chunk_size, args.nb_of_partitions, args.krange,
              args.ICL_margin, args.draw_ICL, args.cpu, args.seed,
              args.keep_tmp_files, show_bar=args.show_prog_bars)

    writePangenome(pan, pan.file, args.force, show_bar=args.show_prog_bars)
Exemple #4
0
def launch(args):
    """Predict hotspots for a pangenome and write the result back to its file.

    The output directory is only prepared when a spot graph or hotspot
    drawings are requested.

    :param args: parsed command-line arguments
    """
    pan = Pangenome()
    pan.addFile(args.pangenome)

    produces_files = args.spot_graph or args.draw_hotspots
    if produces_files:
        mkOutdir(args.output, args.force)

    predictHotspots(
        pan,
        args.output,
        force=args.force,
        cpu=args.cpu,
        spot_graph=args.spot_graph,
        overlapping_match=args.overlapping_match,
        set_size=args.set_size,
        exact_match=args.exact_match_size,
        draw_hotspot=args.draw_hotspots,
        interest=args.interest,
    )
    writePangenome(pan, pan.file, args.force)
Exemple #5
0
def launch(args):
    """Draw the figures selected on the command line for a pangenome.

    ``drawTilePlot`` runs when ``args.tile_plot`` is truthy and ``drawUCurve``
    when ``args.ucurve`` is truthy; the two are independent.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    out_dir = args.output
    if args.tile_plot:
        drawTilePlot(pan, out_dir, args.nocloud)
    if args.ucurve:
        drawUCurve(pan, out_dir, soft_core=args.soft_core)
Exemple #6
0
def launchSequences(args):
    """Write the sequence files selected on the command line.

    ``checkOptions`` is called first, before the output directory is
    prepared and before the pangenome file is attached.

    :param args: parsed command-line arguments
    """
    checkOptions(args)
    mkOutdir(args.output, args.force)

    pan = Pangenome()
    pan.addFile(args.pangenome)

    writeSequenceFiles(
        pan,
        args.output,
        fasta=args.fasta,
        anno=args.anno,
        soft_core=args.soft_core,
        regions=args.regions,
        genes=args.genes,
        gene_families=args.gene_families,
        prot_families=args.prot_families,
        compress=args.compress,
        disable_bar=args.disable_prog_bar,
    )
Exemple #7
0
def launch(args):
    """Write the flat files selected on the command line for a pangenome.

    All options are forwarded to ``writeFlatFiles`` positionally, so the
    order of the tuple below must match that function's parameter order.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    positional_options = (
        args.cpu,
        args.soft_core,
        args.dup_margin,
        args.csv,
        args.Rtab,
        args.gexf,
        args.light_gexf,
        args.projection,
        args.stats,
        args.json,
        args.partitions,
        args.families_tsv,
        args.all_genes,
        args.all_prot_families,
        args.all_gene_families,
        args.compress,
    )
    writeFlatFiles(pan, args.output, *positional_options)
Exemple #8
0
def launchMSA(args):
    """Compute and write the MSA files requested on the command line.

    Prepares the output directory with ``mkOutdir``, attaches the file named
    by ``args.pangenome`` to a fresh ``Pangenome`` and delegates the work to
    ``writeMSAFiles``.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    msa_options = {
        "cpu": args.cpu,
        "partition": args.partition,
        "tmpdir": args.tmpdir,
        "source": args.source,
        "force": args.force,
        "show_bar": args.show_prog_bars,
    }
    writeMSAFiles(pan, args.output, **msa_options)
Exemple #9
0
def launch(args):
    """Predict hotspots for a pangenome and write the result back to its file.

    The output directory is only prepared when a spot graph is requested.
    A deprecation warning is logged when any of the former drawing options
    is set; those options are not forwarded to ``predictHotspots``.

    :param args: parsed command-line arguments
    """
    pan = Pangenome()
    pan.addFile(args.pangenome)

    if args.spot_graph:
        mkOutdir(args.output, args.force)

    uses_deprecated_option = (args.draw_hotspots or args.interest
                              or args.fig_margin or args.priority)
    if uses_deprecated_option:
        deprecation_notice = (
            "Options to draw the spots with the 'ppanggolin spot' subcommand have been deprecated, "
            "and are now dealt with in a dedicated subcommand 'ppanggolin drawspot'.")
        logging.getLogger().warning(deprecation_notice)

    hide_bar = args.disable_prog_bar
    predictHotspots(
        pan,
        args.output,
        force=args.force,
        cpu=args.cpu,
        spot_graph=args.spot_graph,
        overlapping_match=args.overlapping_match,
        set_size=args.set_size,
        exact_match=args.exact_match_size,
        disable_bar=hide_bar,
    )
    writePangenome(pan, pan.file, args.force, disable_bar=hide_bar)
Exemple #10
0
def launchSequences(args):
    """Write the sequence files selected on the command line.

    In this variant ``checkOptions`` runs after the output directory has
    been prepared and the pangenome file has been attached.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)
    checkOptions(args)

    sequence_options = {
        "fasta": args.fasta,
        "anno": args.anno,
        "cpu": args.cpu,
        "regions": args.regions,
        "genes": args.genes,
        "prot_families": args.prot_families,
        "gene_families": args.gene_families,
        "compress": args.compress,
        "show_bar": args.show_prog_bars,
    }
    writeSequenceFiles(pan, args.output, **sequence_options)
Exemple #11
0
def launch(args):
    """Draw the figures selected on the command line for a pangenome.

    Three independent drawings can be produced: the tile plot
    (``args.tile_plot``), the U-curve (``args.ucurve``) and the spots
    (whenever ``args.spots`` is not the empty string).

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    if args.tile_plot:
        drawTilePlot(pan, args.output, args.nocloud,
                     disable_bar=args.disable_prog_bar)

    if args.ucurve:
        drawUCurve(pan, args.output, soft_core=args.soft_core,
                   disable_bar=args.disable_prog_bar)

    spots_requested = args.spots != ''
    if spots_requested:
        drawSpots(pangenome=pan, output=args.output, spot_list=args.spots,
                  disable_bar=args.disable_prog_bar)
Exemple #12
0
def launch(args):
    """Align the sequences given on the command line against a pangenome.

    A deprecation warning is logged when ``--interest``, ``--fig_margin`` or
    ``--label_priority`` is set; those options are not forwarded to
    ``align``.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    uses_deprecated_option = (args.interest or args.fig_margin
                              or args.label_priority)
    if uses_deprecated_option:
        deprecation_notice = (
            "Options --interest, --fig_margin and --label_priority are deprecated, "
            "and the actions they defined are now doable directly in the interactive figures "
            "that are drawn")
        logging.getLogger().warning(deprecation_notice)

    align_options = {
        "pangenome": pan,
        "sequenceFile": args.sequences,
        "output": args.output,
        "tmpdir": args.tmpdir,
        "cpu": args.cpu,
        "identity": args.identity,
        "coverage": args.coverage,
        "no_defrag": args.no_defrag,
        "getinfo": args.getinfo,
        "draw_related": args.draw_related,
        "disable_bar": args.disable_prog_bar,
    }
    align(**align_options)
Exemple #13
0
def launch(args):
    """Main code run when the rarefaction curve is launched from the command line.

    Prepares the output directory with ``mkOutdir``, attaches the file named
    by ``args.pangenome`` to a fresh ``Pangenome`` and forwards the sampling
    and partitioning options to ``makeRarefactionCurve``.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    # Keyword expected by makeRarefactionCurve -> value read from the CLI.
    rarefaction_options = {
        "pangenome": pan,
        "output": args.output,
        "tmpdir": args.tmpdir,
        "beta": args.beta,
        "depth": args.depth,
        "minSampling": args.min,
        "maxSampling": args.max,
        "sm_degree": args.max_degree_smoothing,
        "free_dispersion": args.free_dispersion,
        "chunk_size": args.chunk_size,
        "K": args.nb_of_partitions,
        "cpu": args.cpu,
        "seed": args.seed,
        "kestimate": args.reestimate_K,
        "krange": args.krange,
        "soft_core": args.soft_core,
    }
    makeRarefactionCurve(**rarefaction_options)
Exemple #14
0
def writeMSAFiles(pangenome,
                  output,
                  cpu=1,
                  partition="core",
                  tmpdir="/tmp",
                  source="protein",
                  force=False,
                  show_bar=True):
    """Compute the MSA of the families of one partition and write them to disk.

    The alignments go to ``<output>/msa_<partition>_<source>/``.

    :param pangenome: pangenome whose families are aligned
    :param output: base output directory (concatenated with a ``str`` suffix)
    :param cpu: forwarded to ``computeMSA``
    :param partition: partition filter handed to ``getFamiliesToWrite``
    :param tmpdir: forwarded to ``computeMSA``
    :param source: forwarded to ``computeMSA`` and used in the directory name
    :param force: forwarded to ``mkOutdir`` for the MSA directory
    :param show_bar: forwarded to ``checkPangenomeInfo`` and ``computeMSA``
    """
    log = logging.getLogger()

    # Partition information is only required for these three filters.
    partitions_required = partition in ("persistent", "shell", "cloud")

    msa_dir = output + f"/msa_{partition}_{source}/"
    mkOutdir(msa_dir, force=force)

    checkPangenomeInfo(pangenome, needAnnotations=True, needFamilies=True,
                       needPartitions=partitions_required,
                       needGeneSequences=True, show_bar=show_bar)

    log.info(f"computing msa for {partition} families...")
    selected_families = getFamiliesToWrite(pangenome,
                                           partitionFilter=partition)

    # This must exist since we loaded the pangenome and families are required.
    translation_table = pangenome.parameters["cluster"]["translation_table"]

    computeMSA(selected_families, msa_dir, cpu=cpu, tmpdir=tmpdir,
               source=source, code=translation_table, show_bar=show_bar)
    log.info(f"Done writing all {partition} MSA in: {msa_dir}")
Exemple #15
0
def write_nem_input_files(tmpdir, organisms, sm_degree):
    """Write the NEM input files for a subset of organisms into ``tmpdir``.

    Files written (all inside ``tmpdir``):

    * ``column_org_file``: one line with the double-quoted organism names,
      separated by spaces.
    * ``nem_file.dat``: one row per retained gene family, with one
      tab-separated "0"/"1" column per organism.
    * ``nem_file.index``: ``<1-based family index>\\t<family name>`` per row.
    * ``nem_file.nei``: a first line ``1``, then one row per retained family
      listing its index, its neighbor count, the neighbor indices and the
      edge weights (or ``<index>\\t0`` when no neighbors are written).
    * ``nem_file.str``: a single line ``S\\t<nb families>\\t<nb organisms>``.

    A family is retained when it shares at least one organism with
    ``organisms``.

    NOTE(review): ``pan`` is neither a parameter nor a local of this
    function; it is resolved from the enclosing (module) scope. Confirm it
    is always set before this function is called.

    :param tmpdir: directory path (``str``) in which the files are written
    :param organisms: subset of organisms to consider; used with
        ``isdisjoint``, ``&`` and ``len``, so presumably a ``set`` (TODO
        confirm against callers)
    :param sm_degree: families with at least this many neighbors get no
        neighbors written in ``nem_file.nei``
    :return: tuple ``(total_edges_weight / 2, number of retained families)``
    """

    # NOTE(review): mkOutdir is defined elsewhere; presumably it creates
    # tmpdir and fails if it already exists since force=False - confirm.
    mkOutdir(tmpdir, force=False)
    total_edges_weight = 0

    with open(tmpdir + "/column_org_file", "w") as org_file:
        org_file.write(" ".join([f'"{org.name}"' for org in organisms]) + "\n")

    logging.getLogger().debug(
        "Writing nem_file.str nem_file.index nem_file.nei and nem_file.dat files"
    )
    with open(tmpdir+"/nem_file.str", "w") as str_file,\
        open(tmpdir+"/nem_file.index", "w") as index_file,\
        open(tmpdir+"/nem_file.nei", "w") as nei_file,\
        open(tmpdir+"/nem_file.dat", "w") as dat_file:

        nei_file.write("1\n")
        # family -> 1-based row index shared by the .dat, .index and .nei files
        index_fam = {}

        # organism -> column position in a .dat row
        index_org = {}
        # template .dat row: one "0" per organism
        default_dat = []
        for index, org in enumerate(organisms):
            default_dat.append('0')
            index_org[org] = index
        for fam in pan.geneFamilies:
            # could use bitarrays if this part is limiting?
            if not organisms.isdisjoint(fam.organisms):
                # copy the template so that each family gets its own row
                currDat = list(default_dat)
                curr_orgs = fam.organisms & organisms
                for org in curr_orgs:
                    currDat[index_org[org]] = "1"
                dat_file.write("\t".join(currDat) + "\n")
                # indices start at 1 and follow the order of the .dat rows
                index_fam[fam] = len(index_fam) + 1
                index_file.write(f"{len(index_fam)}\t{fam.name}\n")

        for fam in index_fam.keys():
            row_fam = []  # indices of the neighboring families
            row_dist_score = []  # edge weights, aligned with row_fam
            neighbor_number = 0
            sum_dist_score = 0
            for edge in fam.edges:  # iterate over the family's edges.
                # number of genes supporting this edge within the subset
                coverage = sum([
                    len(gene_list)
                    for org, gene_list in edge.organisms.items()
                    if org in organisms
                ])
                if coverage == 0:
                    continue  # nothing interesting to write, this edge does not exist with this subset of organisms.
                distance_score = coverage / len(organisms)
                sum_dist_score += distance_score
                # the neighbor is whichever end of the edge is not `fam`
                row_fam.append(
                    str(index_fam[edge.target if fam ==
                                  edge.source else edge.source]))
                row_dist_score.append(str(round(distance_score, 4)))
                neighbor_number += 1
            # families with sm_degree neighbors or more fall through to the
            # else branch and are written as having no neighbors
            if neighbor_number > 0 and float(neighbor_number) < sm_degree:
                total_edges_weight += sum_dist_score
                # row layout: index, neighbor count, neighbor indices, weights
                nei_file.write('\t'.join([
                    str(item)
                    for sublist in [[index_fam[fam]], [neighbor_number],
                                    row_fam, row_dist_score]
                    for item in sublist
                ]) + "\n")
            else:
                nei_file.write(str(index_fam[fam]) + "\t0\n")

        str_file.write("S\t" + str(len(index_fam)) + "\t" +
                       str(len(organisms)) + "\n")
    # NOTE(review): halved presumably because each edge is visited once from
    # each of its two endpoint families - confirm.
    return total_edges_weight / 2, len(index_fam)
Exemple #16
0
def launch(args):
    """Run the complete workflow and log how long each step took.

    Steps, in order: annotation (read from ``args.anno`` or computed from
    ``args.fasta``), clustering (read from ``args.clusters`` or computed),
    neighbors graph, partition, RGP prediction, spot prediction and module
    prediction.  The pangenome is written to ``filename`` after the main
    stages.  Unless ``args.only_pangenome`` is set, figures and flat files
    are produced as well.

    :param args: parsed command-line arguments
    :raises Exception: when annotations carry no sequences and neither
        clusters nor fasta files were provided
    """
    check_option_workflow(args)
    pangenome = Pangenome()
    filename = mkFilename(args.basename, args.output, args.force)
    log = logging.getLogger()
    hide_bar = args.disable_prog_bar
    writing_time = anno_time = clust_time = mod_time = desc_time = None

    if args.anno:  # annotations are provided: read them
        tick = time.time()
        readAnnotations(pangenome, args.anno, cpu=args.cpu,
                        disable_bar=hide_bar)
        anno_time = time.time() - tick

        tick = time.time()
        writePangenome(pangenome, filename, args.force, disable_bar=hide_bar)
        writing_time = time.time() - tick

        # Without clusters we need gene sequences, either from the
        # annotation files themselves or from the fasta files.
        if args.clusters is None and pangenome.status["geneSequences"] == "No":
            if args.fasta is None:
                raise Exception(
                    "The gff/gbff provided did not have any sequence informations, "
                    "you did not provide clusters and you did not provide fasta file. "
                    "Thus, we do not have the information we need to continue the analysis."
                )
            getGeneSequencesFromFastas(pangenome, args.fasta)

        tick = time.time()
        if args.clusters is not None:
            readClustering(pangenome, args.clusters, disable_bar=hide_bar)
        else:  # we should have the sequences here.
            clustering(pangenome, args.tmpdir, args.cpu,
                       identity=args.identity, coverage=args.coverage,
                       mode=args.mode, defrag=not args.no_defrag,
                       disable_bar=hide_bar)
        clust_time = time.time() - tick
    elif args.fasta is not None:
        tick = time.time()
        annotatePangenome(pangenome, args.fasta, args.tmpdir, args.cpu,
                          contig_filter=args.contig_filter,
                          disable_bar=hide_bar)
        anno_time = time.time() - tick

        tick = time.time()
        writePangenome(pangenome, filename, args.force, disable_bar=hide_bar)
        writing_time = time.time() - tick

        tick = time.time()
        clustering(pangenome, args.tmpdir, args.cpu,
                   identity=args.identity, coverage=args.coverage,
                   mode=args.mode, defrag=not args.no_defrag,
                   disable_bar=hide_bar)
        clust_time = time.time() - tick

    # This write is not included in the reported writing time.
    writePangenome(pangenome, filename, args.force, disable_bar=hide_bar)

    tick = time.time()
    computeNeighborsGraph(pangenome, disable_bar=hide_bar)
    graph_time = time.time() - tick

    tick = time.time()
    partition(pangenome, tmpdir=args.tmpdir, cpu=args.cpu,
              K=args.nb_of_partitions, disable_bar=hide_bar)
    part_time = time.time() - tick

    tick = time.time()
    writePangenome(pangenome, filename, args.force, disable_bar=hide_bar)
    writing_time = writing_time + time.time() - tick

    tick = time.time()
    predictRGP(pangenome, disable_bar=hide_bar)
    regions_time = time.time() - tick

    tick = time.time()
    predictHotspots(pangenome, args.output, disable_bar=hide_bar)
    spot_time = time.time() - tick

    tick = time.time()
    predictModules(pangenome=pangenome, cpu=args.cpu, tmpdir=args.tmpdir,
                   disable_bar=hide_bar)
    mod_time = time.time() - tick

    tick = time.time()
    writePangenome(pangenome, filename, args.force, disable_bar=hide_bar)
    writing_time = writing_time + time.time() - tick

    if not args.only_pangenome:
        tick = time.time()
        figures_dir = args.output + '/spot_figures'
        mkOutdir(figures_dir, force=True)
        drawSpots(pangenome=pangenome,
                  output=args.output + '/spot_figures',
                  spot_list='all',
                  disable_bar=hide_bar)
        # Drawing the spots is accounted for in the spot timing.
        spot_time = spot_time + time.time() - tick

        if args.rarefaction:
            makeRarefactionCurve(pangenome, args.output, args.tmpdir,
                                 cpu=args.cpu, disable_bar=hide_bar)

        nb_organisms = len(pangenome.organisms)
        if 1 < nb_organisms < 5000:
            # The cloud is left out of the tile plot from 500 organisms up.
            drawTilePlot(pangenome, args.output,
                         nocloud=len(pangenome.organisms) >= 500)
        drawUCurve(pangenome, args.output)

        tick = time.time()
        writeFlatFiles(pangenome, args.output, args.cpu,
                       csv=True, genePA=True, gexf=True, light_gexf=True,
                       projection=True, json=True, stats=True,
                       partitions=True, regions=True, spots=True,
                       borders=True, spot_modules=True, modules=True)
        desc_time = time.time() - tick

    # (step label, elapsed seconds), reported in this order.
    timings = [
        ("Annotation", anno_time),
        ("Clustering", clust_time),
        ("Building the graph", graph_time),
        ("Partitioning the pangenome", part_time),
        ("Predicting RGP", regions_time),
        ("Gathering RGP into spots", spot_time),
        ("Predicting modules", mod_time),
        ("Writing the pangenome data in HDF5", writing_time),
    ]
    if not args.only_pangenome:
        timings.append(
            ("Writing descriptive files for the pangenome", desc_time))
    for step, elapsed in timings:
        log.info(f"{step} took : {round(elapsed, 2)} seconds")

    printInfo(filename, content=True)
Exemple #17
0
def launch(args):
    """Align the proteins given on the command line against a pangenome.

    All options are forwarded to ``align`` positionally, so the order of
    the tuple below must match that function's parameter order.

    :param args: parsed command-line arguments
    """
    mkOutdir(args.output, args.force)
    pan = Pangenome()
    pan.addFile(args.pangenome)

    positional_options = (
        args.proteins,
        args.output,
        args.tmpdir,
        args.identity,
        args.coverage,
        args.defrag,
        args.cpu,
    )
    align(pan, *positional_options)