Example #1
def calculate_allele_length(output_dir, database, interval=20):
    lf = logs.LoggerFactory()
    lf.addConsoleHandler()
    logger = lf.create()
    db.load_database_config(logger=logger)
    logger.info("Start calculating allele length heatmap...")
    plot_length_heatmap(output_dir, database, interval=interval)
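A minimal usage sketch; the importing module path and the database name below are placeholders, not part of the original:

from analysis import calculate_allele_length  # hypothetical import path

# Bin allele lengths into 20 bp intervals and write the heatmap under "out/".
calculate_allele_length("out", "listeria_db", interval=20)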
Example #2
def calculate_loci_coverage(input_dir, output_dir, database):
    lf = logs.LoggerFactory()
    lf.addConsoleHandler()
    logger = lf.create()
    db.load_database_config(logger=logger)
    logger.info("Start calculating locus coverage...")
    subject_number = count_subjects(input_dir)
    logger.info("Start plotting locus coverage...")
    plot_stats(output_dir, subject_number, database)
Example #3
def power(database):
    lf = logs.LoggerFactory()
    lf.addConsoleHandler()
    logger = lf.create()
    db.load_database_config(logger=logger)
    sql = "select locus_id, count(locus_id) as counts from pairs group by locus_id;"
    counts = db.from_sql(sql, database=database)
    counts["log_counts"] = np.log2(counts["counts"])
    return np.sum(counts["log_counts"])
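The query tallies how many allele assignments each locus has in the pairs table; summing log2 of those tallies yields a single discriminatory-power score that grows with both the number of loci and their allele diversity. The same arithmetic on a self-contained toy frame (values invented for illustration):

import numpy as np
import pandas as pd

# Stand-in for the result of the SQL query above.
counts = pd.DataFrame({"locus_id": ["locusA", "locusB", "locusC"],
                       "counts": [4, 8, 2]})
counts["log_counts"] = np.log2(counts["counts"])
print(np.sum(counts["log_counts"]))  # log2(4) + log2(8) + log2(2) = 6.0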
Example #4
def profiling(output_dir, input_dir, database, threads, occr_level=None, selected_loci=None, logger=None,
              aligcov_cut=0.5, identity=90):
    load_database_config()
    if not logger:
        logger = logs.console_logger(__name__)

    logger.info("Renaming contigs...")
    query_dir = files.joinpath(output_dir, "query")
    files.create_if_not_exist(query_dir)
    namemap = rename(query_dir, input_dir)
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))

    if os.path.isdir(database):
        logger.info("Profiling loci...")
        refseq_fna = files.joinpath(database, "panRefSeq.fa")
        profile_loci(refseq_fna, query_dir, output_dir, aligcov_cut, identity, threads)

        logger.info("Allocating alleles...")
        profile_alleles(query_dir, database, output_dir, threads, occr_level)
    else:
        logger.info("Identifying loci and allocating alleles...")

        # select loci by scheme
        if selected_loci:
            selected_loci = set(selected_loci)
        else:
            query = "select locus_id from scheme where occurence>={};".format(occr_level)
            selected_loci = set(sql_query(query, database=database).iloc[:, 0])

        temp_dir = os.path.join(query_dir, "temp")
        files.create_if_not_exist(temp_dir)

        collect = []
        args = [(os.path.join(query_dir, filename), temp_dir)
                for filename in os.listdir(query_dir) if filename.endswith(".fa")]
        with ProcessPoolExecutor(threads) as executor:
            for filename in executor.map(identify_loci, args):
                genome_id = files.fasta_filename(filename)
                target_file = os.path.join(temp_dir, genome_id + ".locus.fna")
                profile = profile_by_query(target_file, genome_id, selected_loci, database)
                collect.append(profile)
        result = pd.concat(collect, axis=1)
        result.to_csv(files.joinpath(output_dir, "wgmlst.tsv"), sep="\t")

    shutil.rmtree(query_dir)
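A hedged call sketch; every argument value here is a placeholder and the import path is an assumption:

from profiling import profiling  # hypothetical import path

profiling(output_dir="out",
          input_dir="genomes",   # directory of query FASTA files
          database="wgmlst_db",  # a database name, or a directory holding panRefSeq.fa
          threads=4,
          occr_level=95)         # occurrence threshold used when querying the scheme table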
Example #5
def richness(database, weighted=True):
    lf = logs.LoggerFactory()
    lf.addConsoleHandler()
    logger = lf.create()
    db.load_database_config(logger=logger)
    sql = "select a.locus_id, a.allele_id, b.count" \
          " from pairs as a" \
          " left join (select allele_id, count from alleles) as b" \
          " on a.allele_id=b.allele_id;"
    counts = db.from_sql(sql, database=database)
    ent = counts.groupby("locus_id").agg({"count": locus_entropy})
    if weighted:
        sql = "select locus_id, occurrence from loci;"
        loci = db.from_sql(sql, database=database)
        weight = pd.merge(ent, loci, left_index=True, right_on="locus_id")
        return np.average(weight["count"], weights=weight["occurrence"])
    else:
        return np.average(ent)
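locus_entropy is referenced but not shown in this excerpt. A plausible definition (an assumption, not the project's confirmed code) is the Shannon entropy of the allele-count distribution at a single locus, which is what the groupby/agg call above would feed it as a Series:

import numpy as np

def locus_entropy(counts):
    # counts: per-allele occurrence counts for one locus (assumed positive).
    probs = np.asarray(counts, dtype=float)
    probs = probs / probs.sum()
    return -np.sum(probs * np.log2(probs))

Under that reading, richness(database) returns the occurrence-weighted mean entropy across loci when weighted=True.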
Example #6
def make_database(output_dir, drop_by_occur, logger=None, threads=2):
    if not logger:
        lf = logs.LoggerFactory()
        lf.addConsoleHandler()
        lf.addFileHandler(os.path.join(output_dir, "make_database.log"))
        logger = lf.create()
    db.load_database_config(logger=logger)

    logger.info("Calculating the pan genome...")
    min_identity = 95
    c = cmds.form_roary_cmd(os.path.join(output_dir, "GFF"), output_dir,
                            min_identity, threads)
    logger.info("Run roary with following command: " + c)
    subprocess.run(c, shell=True)

    logger.info("Creating database")
    dbname = os.path.basename(
        output_dir[:-1] if output_dir.endswith("/") else output_dir)
    db.createdb(dbname)
    db.create_pgadb_relations(dbname)

    logger.info("Extract profiles from roary result matrix...")
    matrix_file = os.path.join(output_dir, "roary",
                               "gene_presence_absence.csv")
    profiles, total_isolates = extract_profiles(matrix_file, dbname)

    logger.info(
        "Collecting allele profiles and building allele frequencies and reference sequences..."
    )
    ffn_dir = os.path.join(output_dir, "FFN")
    profile_file = os.path.join(output_dir, "allele_profiles.tsv")
    profiles, freq = collect_allele_info(profiles, ffn_dir)

    logger.info("Checking duplicated loci by self-blastp...")
    blastp_out_file, ref_length = reference_self_blastp(output_dir, freq)

    logger.info(
        "Filtering out high-identity loci and dropping loci whose occurrence is less than {}..."
        .format(drop_by_occur))
    filtered_loci = filter_locus(blastp_out_file, ref_length, total_isolates,
                                 drop_by_occur)
    os.remove(blastp_out_file)

    logger.info("Updating and saving profiles...")
    freq = {l: freq[l] for l in filtered_loci}
    profiles = profiles[profiles.index.isin(filtered_loci)]
    profiles.to_csv(profile_file, sep="\t")

    logger.info("Saving allele sequences...")
    refseqs = {
        locus: counter.most_common(1)[0][0]
        for locus, counter in freq.items()
    }
    save_sequences(freq, refseqs, dbname)

    logger.info("Making dynamic schemes...")
    refseqs = {locus: operations.make_seqid(seq)
               for locus, seq in refseqs.items()}
    make_schemes(refseqs, total_isolates)
    logger.info("Done!!")
    return dbname
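A call sketch under the assumption that output_dir already holds the GFF/ and FFN/ directories that roary and collect_allele_info consume; the path is a placeholder:

from database_builder import make_database  # hypothetical import path

# Builds the pan genome, loads it into PostgreSQL, and drops loci
# whose occurrence falls below the given threshold.
dbname = make_database("builds/salmonella", drop_by_occur=5, threads=8)
print(dbname)  # "salmonella", derived from the output directory name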
Example #7
def profiling(output_dir,
              input_dir,
              database,
              threads,
              occr_level=None,
              selected_loci=None,
              profile_file="profile",
              enable_adding_new_alleles=True,
              generate_profiles=True,
              generate_bn=True,
              logger=None,
              debug=False):
    if not logger:
        lf = logs.LoggerFactory()
        lf.addConsoleHandler()
        lf.addFileHandler(os.path.join(output_dir, "profiling.log"))
        logger = lf.create()
    db.load_database_config(logger=logger)
    pid = uuid.uuid4().hex[0:8]

    logger.info("Formating contigs...")
    query_dir = os.path.join(output_dir, "query_{}".format(pid))
    os.makedirs(query_dir, exist_ok=True)
    contighandler = files.ContigHandler()
    contighandler.new_format(input_dir, query_dir)

    model = re.search(r'[a-zA-Z]+\w[a-zA-Z]+', database).group(0)
    logger.info("Used model: {}".format(model))

    logger.info("Selecting loci by specified scheme {}%...".format(occr_level))
    if selected_loci:
        selected_loci = set(selected_loci)
    else:  # select loci by scheme
        query = "select locus_id from loci where occurrence>={};".format(
            occr_level)
        selected_loci = set(db.from_sql(query, database=database).iloc[:, 0])

    logger.info("Making reference blastdb for blastp...")
    temp_dir = os.path.join(query_dir, "temp_{}".format(pid))
    os.makedirs(temp_dir, exist_ok=True)
    ref_db = os.path.join(temp_dir, "ref_blastpdb_{}".format(pid))
    ref_len = make_ref_blastpdb(ref_db, database)

    logger.info("Identifying loci and allocating alleles...")
    args = [(os.path.join(query_dir, filename), temp_dir, model)
            for filename in os.listdir(query_dir) if filename.endswith(".fa")]
    with ThreadPoolExecutor(threads) as executor:
        id_allele_list = list(executor.map(identify_alleles, args))

    if enable_adding_new_alleles:
        logger.info("Adding new alleles to database...")
        add_new_alleles(id_allele_list, ref_db, temp_dir, ref_len)

    logger.info("Collecting allele profiles of each genomes...")
    allele_counts = Counter()
    if generate_profiles:
        collect = []
        for genome_id, alleles in id_allele_list:
            profile = profile_by_query(alleles, genome_id, selected_loci,
                                       database)
            collect.append(profile)
            allele_counts.update(alleles.keys())
        result = pd.concat(collect, axis=1)
        result.to_csv(os.path.join(output_dir, profile_file + ".tsv"),
                      sep="\t")
        if generate_bn:
            bio = to_bionumerics_format(result)
            bio.to_csv(os.path.join(output_dir,
                                    "bionumerics_{}.csv".format(pid)),
                       index=False)
    else:
        logger.info("Not going to output profiles.")
        for genome_id, alleles in id_allele_list:
            allele_counts.update(alleles.keys())

    # Reshape the Counter into a two-column frame: allele_id, count.
    allele_counts = pd.DataFrame(allele_counts, index=[0]).T\
        .reset_index().rename(columns={"index": "allele_id", 0: "count"})
    update_allele_counts(allele_counts, database,
                         "batch_add_counts_{}".format(pid))
    if not debug and os.path.exists(query_dir):
        shutil.rmtree(query_dir)
    logger.info("Done!")
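A hedged call sketch for this later variant; argument values are placeholders and the import path is an assumption:

from profiling import profiling  # hypothetical import path

profiling(output_dir="out",
          input_dir="genomes",
          database="vibrio_cholerae",       # must match the model regex above
          threads=4,
          occr_level=95,
          enable_adding_new_alleles=True,   # insert novel alleles into the database
          generate_profiles=True,           # write out/profile.tsv
          generate_bn=True)                 # also emit a BioNumerics-style CSV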
Example #8
from flask import abort, send_file
from flask_restful import Resource, reqparse
from werkzeug.datastructures import FileStorage
import psycopg2
from datetime import datetime
import hashlib
import os
from threading import Thread
from src.api import internals
from src.utils import db

INDIR = "input"
OUTDIR = "output"
DB = "profiling"
db.load_database_config()


def create_if_not_exist(path):
    if not os.path.exists(path):
        os.makedirs(path)


def get_seq_id(file):
    file.seek(0)
    m = hashlib.sha256()
    m.update(file.read())
    return m.hexdigest()


class UploadListAPI(Resource):
    def __init__(self):