Example #1
import datetime
import json
import os

def profiling_api(batch_id, database, occr_level):
    # INDIR/OUTDIR and the files, wgmlst, phylotree, and db helpers come from
    # the surrounding project modules (not shown in this snippet).
    input_dir = os.path.join(INDIR, batch_id)
    files.create_if_not_exist(OUTDIR)
    output_dir = os.path.join(OUTDIR, batch_id)
    files.create_if_not_exist(output_dir)
    wgmlst.profiling(output_dir, input_dir, database, occr_level=occr_level, threads=2)
    profile_created = datetime.datetime.now()

    with open(os.path.join(output_dir, "namemap.json"), "r") as file:
        names = json.loads(file.read())
    profile_filename = os.path.join(output_dir,
                                    "cgMLST_{}_{}_{}.tsv".format(database, occr_level, batch_id[0:8]))
    os.rename(os.path.join(output_dir, "wgmlst.tsv"), profile_filename)
    dendro = phylotree.Dendrogram()
    dendro.make_tree(profile_filename, names)
    dendro_created = datetime.datetime.now()
    newick_filename = os.path.join(output_dir, "dendrogram_{}.newick".format(batch_id[0:8]))
    dendro.to_newick(newick_filename)
    pdf_filename = os.path.join(output_dir, "dendrogram_{}.pdf".format(batch_id[0:8]))
    dendro.scipy_tree(pdf_filename)
    svg_filename = os.path.join(output_dir, "dendrogram_{}.svg".format(batch_id[0:8]))
    dendro.scipy_tree(svg_filename)
    png_filename = os.path.join(output_dir, "dendrogram_{}.png".format(batch_id[0:8]))
    dendro.scipy_tree(png_filename)

    sql = "INSERT INTO profile (id,created,file,occurrence,database) VALUES(%s,%s,%s,%s,%s);"
    data = (batch_id, profile_created, profile_filename, occr_level, database)
    db.to_sql(sql, data, database="profiling")

    sql = "INSERT INTO dendrogram (id,created,png_file,pdf_file,svg_file,newick_file) VALUES(%s,%s,%s,%s,%s,%s);"
    data = (batch_id, dendro_created, png_filename, pdf_filename, svg_filename, newick_filename)
    db.to_sql(sql, data, database="profiling")
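A minimal invocation sketch for the function above; the batch id and database name are hypothetical, and INDIR/OUTDIR are assumed to be preconfigured module constants:

# Hypothetical call: profile one upload batch against a cgMLST database
# at a 95% locus-occurrence threshold. Outputs land under OUTDIR/<batch_id>.
profiling_api(
    batch_id="1a2b3c4d-0000-0000-0000-000000000000",  # hypothetical batch UUID
    database="Vibrio_cholerae",                       # hypothetical database name
    occr_level=95,
)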
Example #2
    def make_profile(self):
        """Take the directory in profileSelectText and run wgMLST profiling on it."""
        switch_widgets = [self.profileSelector, self.profileSelectText, self.joblist_2, self.runButton]
        self.disable(switch_widgets)

        # setup paths
        option = str(self.joblist_2.currentItem().text())
        database_dir = files.joinpath(self.data_dir, JobType.PGDB.to_str(), option, "DB")
        profile_dir = str(self.profileSelectText.toPlainText())
        query_dir = files.joinpath(self.data_dir, JobType.WGMLST.to_str())
        files.create_if_not_exist(query_dir)

        # create new job
        self.jobmgr = JobManager(self.data_dir)
        jobid, job_dir = self.jobmgr.start_job(JobType.WGMLST)
        self.jobmgr.close()

        # setup logger
        factory = LoggerFactory()
        factory.addLogBoxHandler(self.logbox_2)  # TODO: bug -- crash after Worker is done.
        factory.addFileHandler(files.joinpath(query_dir, "log_" + jobid + ".txt"))
        logger = factory.create()

        # process algorithms
        # Per the signature in Example #5, `threads` is the fourth positional
        # parameter of profiling(), so the logger must be passed by keyword
        # (threads=2 mirrors Example #1).
        wgmlst.profiling(job_dir, profile_dir, database_dir, threads=2, logger=logger)

        self.enable(switch_widgets)
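One fragility worth noting: if wgmlst.profiling raises, the widgets are never re-enabled. A hedged sketch of the same slot with a try/finally guard (names exactly as above, body elided):

    def make_profile(self):
        switch_widgets = [self.profileSelector, self.profileSelectText, self.joblist_2, self.runButton]
        self.disable(switch_widgets)
        try:
            ...  # path setup, job creation, logging, wgmlst.profiling as above
        finally:
            self.enable(switch_widgets)  # re-enable even if profiling raises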
Example #3
    def make_database(self):
        """Take the directory in fileSelectText and build a pan-genome database from it."""
        switch_widgets = [self.fileSubmitter, self.fileSelectText, self.fileSelector]
        self.disable(switch_widgets)

        # setup paths
        source_dir = str(self.fileSelectText.toPlainText())
        database_dir = files.joinpath(self.data_dir, JobType.PGDB.to_str())
        files.create_if_not_exist(database_dir)

        # create new job
        self.jobmgr = JobManager(self.data_dir)
        jobid, job_dir = self.jobmgr.start_job(JobType.PGDB)
        self.jobmgr.close()

        # setup logger
        factory = LoggerFactory()
        factory.addLogBoxHandler(self.logbox_1)  # TODO: bug -- crash after Worker is done.
        factory.addFileHandler(files.joinpath(database_dir, "log_" + jobid + ".txt"))
        logger = factory.create()

        # process algorithms
        pgdb.annotate_configs(source_dir, job_dir, logger=logger)
        pgdb.make_database(job_dir, logger=logger)

        self.enable(switch_widgets)
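The two pgdb steps this slot wraps (defined in Examples #4 and #7) can also be driven headlessly; a minimal sketch, with both paths hypothetical:

# Hypothetical standalone run of the pipeline steps behind this GUI slot.
source_dir = "/data/genomes"     # hypothetical: directory of contig FASTA files
job_dir = "/data/jobs/pgdb_001"  # hypothetical: writable working directory

pgdb.annotate_configs(source_dir, job_dir)  # logger defaults to a console logger
pgdb.make_database(job_dir)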
Example #4
import json
import os
from concurrent.futures import ProcessPoolExecutor

def annotate_configs(input_dir, output_dir, logger=None, threads=8, use_docker=True):
    # files, docker, cmds, logs, and move_file come from project modules.
    if not logger:
        logger = logs.console_logger(__name__)

    logger.info("Formatting contigs...")
    filenames = parse_filenames(input_dir)

    genome_dir = files.joinpath(output_dir, "Genomes")
    files.create_if_not_exist(genome_dir)
    namemap = format_contigs(filenames, input_dir, genome_dir)
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))

    logger.info("Annotating...")
    annotate_dir = files.joinpath(output_dir, "Annotated")
    files.create_if_not_exist(annotate_dir)
    if use_docker:
        docker.prokka(genome_dir, annotate_dir)
    else:
        c = [cmds.form_prokka_cmd(x, genome_dir, annotate_dir) for x in namemap.values()]
        # Halve the worker pool (each prokka run can use multiple threads
        # itself), but never drop to zero workers.
        with ProcessPoolExecutor(max(1, threads // 2)) as executor:
            executor.map(os.system, c)

    logger.info("Moving protein CDS (.ffn) files...")
    ffn_dir = files.joinpath(output_dir, "FFN")
    files.create_if_not_exist(ffn_dir)
    move_file(annotate_dir, ffn_dir, ".ffn")

    logger.info("Moving annotation (.gff) files...")
    gff_dir = files.joinpath(output_dir, "GFF")
    files.create_if_not_exist(gff_dir)
    move_file(annotate_dir, gff_dir, ".gff")

    logger.info("Creating nonCDS.json...")
    create_noncds(output_dir, gff_dir)
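Fanning shell commands out through os.system discards their exit codes. A hedged alternative sketch for the non-Docker branch using subprocess.run (same command list c as above; a deliberate substitution, not the project's code):

import subprocess
from concurrent.futures import ProcessPoolExecutor

def run_checked(cmd):
    # shell=True mirrors the os.system behavior; the exit code is preserved.
    return subprocess.run(cmd, shell=True).returncode

with ProcessPoolExecutor(max(1, threads // 2)) as executor:
    failures = sum(rc != 0 for rc in executor.map(run_checked, c))
if failures:
    logger.warning("%d prokka command(s) exited non-zero", failures)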
Example #5
import json
import os
import shutil
from concurrent.futures import ProcessPoolExecutor

import pandas as pd

def profiling(output_dir, input_dir, database, threads, occr_level=None, selected_loci=None, logger=None,
              aligcov_cut=0.5, identity=90):
    # files, logs, sql_query, rename, and the profile_* / identify_loci
    # helpers come from project modules.
    load_database_config()
    if not logger:
        logger = logs.console_logger(__name__)

    logger.info("Renaming contigs...")
    query_dir = files.joinpath(output_dir, "query")
    files.create_if_not_exist(query_dir)
    namemap = rename(query_dir, input_dir)
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))

    if os.path.isdir(database):
        logger.info("Profiling loci...")
        refseq_fna = files.joinpath(database, "panRefSeq.fa")
        profile_loci(refseq_fna, query_dir, output_dir, aligcov_cut, identity, threads)

        logger.info("Allocating alleles...")
        profile_alleles(query_dir, database, output_dir, threads, occr_level)
    else:
        logger.info("Identifying loci and allocating alleles...")

        # select loci by scheme
        if selected_loci:
            selected_loci = set(selected_loci)
        else:
            query = "select locus_id from scheme where occurence>={};".format(occr_level)
            selected_loci = set(sql_query(query, database=database).iloc[:, 0])

        temp_dir = os.path.join(query_dir, "temp")
        files.create_if_not_exist(temp_dir)

        collect = []
        args = [(os.path.join(query_dir, filename), temp_dir)
                for filename in os.listdir(query_dir)
                if filename.endswith(".fa")]
        with ProcessPoolExecutor(threads) as executor:
            for filename in executor.map(identify_loci, args):
                genome_id = files.fasta_filename(filename)
                target_file = os.path.join(temp_dir, genome_id + ".locus.fna")
                profile = profile_by_query(target_file, genome_id, selected_loci, database)
                collect.append(profile)
        result = pd.concat(collect, axis=1)
        result.to_csv(files.joinpath(output_dir, "wgmlst.tsv"), sep="\t")

    shutil.rmtree(query_dir)
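A minimal usage sketch exercising both branches; all paths and the database name are hypothetical. Whether `database` is a directory on disk (the os.path.isdir check) decides which branch runs:

# Branch 1: `database` is a directory containing panRefSeq.fa.
profiling("/data/out1", "/data/queries", "/data/pgdb/database", threads=4, occr_level=95)

# Branch 2: `database` names a SQL database holding the scheme table.
profiling("/data/out2", "/data/queries", "Vibrio_cholerae", threads=4, occr_level=95)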
Example #6
    def __init__(self, mainTab):
        super(Window, self).__init__()
        self.setupUi(mainTab)

        current_dir = os.path.dirname(__file__)
        self.ROOT_DIR = os.path.abspath(os.path.join(current_dir, os.pardir))
        self.data_dir = os.path.join(self.ROOT_DIR, "data")
        files.create_if_not_exist(self.data_dir)

        self.pool = worker.ThreadPool()

        # wire widget signals to their handlers
        self.fileSelector.clicked.connect(lambda: self.select_dir(self.fileSelectText))
        self.fileSubmitter.clicked.connect(lambda: self.pool.start(self.make_database, ()))

        self.profileSelector.clicked.connect(lambda: self.select_profiles(self.profileSelectText, self.joblist_2))
        self.runButton.clicked.connect(lambda: self.pool.start(self.make_profile, ()))

        self.plottingSelector.clicked.connect(lambda: self.select_dir(self.plottingSelectText))
        self.plottingSelectText.textChanged.connect(lambda: self.plotDendrogram())
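A hypothetical sketch of how this tab could be bootstrapped; the Qt binding and every name other than Window itself are assumptions:

import sys
from PyQt5.QtWidgets import QApplication, QTabWidget  # assuming a PyQt5 build

app = QApplication(sys.argv)
mainTab = QTabWidget()      # hypothetical: the widget handed to setupUi
window = Window(mainTab)    # wires selectors, buttons, and the thread pool
mainTab.show()
sys.exit(app.exec_())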
Example #7
import os

def make_database(output_dir, logger=None, threads=2, use_docker=True):
    # files, docker, cmds, logs, and the extract_/save_/make_ helpers come
    # from project modules.
    if not logger:
        logger = logs.console_logger(__name__)

    database_dir = files.joinpath(output_dir, "database")
    files.create_if_not_exist(database_dir)

    logger.info("Calculating the pan genome...")
    min_identity = 95
    if use_docker:
        docker.roary(files.joinpath(output_dir, "GFF"), output_dir, min_identity, threads)
    else:
        c = cmds.form_roary_cmd(files.joinpath(output_dir, "GFF"), output_dir, min_identity, threads)
        os.system(c)

    logger.info("Extract profiles from roary result matrix...")
    matrix_file = files.joinpath(output_dir, "roary", "gene_presence_absence.csv")
    locusmeta_file = files.joinpath(database_dir, "locus_metadata.tsv")
    paralogmeta_file = files.joinpath(database_dir, "paralog_metadata.tsv")
    profiles, total_isolates = extract_profiles(matrix_file, locusmeta_file, paralogmeta_file)

    logger.info("Collecting allele profiles and making allele frequencies and reference sequence...")
    ffn_dir = files.joinpath(output_dir, "FFN")
    profile_file = files.joinpath(database_dir, "allele_profiles.tsv")
    profiles, freq = collect_allele_infos(profiles, ffn_dir)
    profiles.to_csv(profile_file, sep="\t")

    refseq_file = files.joinpath(database_dir, "panRefSeq.fa")
    refseqs = save_refseq(freq, refseq_file)

    locus_dir = files.joinpath(database_dir, "locusfiles")
    files.create_if_not_exist(locus_dir)
    save_locusfiles(freq, locus_dir)

    allele_freq_file = files.joinpath(database_dir, "allele_frequency.json")
    save_allele_freq(freq, allele_freq_file)

    logger.info("Making dynamic schemes...")
    scheme_file = files.joinpath(database_dir, "scheme.tsv")
    make_schemes(locusmeta_file, scheme_file, refseqs, total_isolates)
    logger.info("Done!!")