def profile_by_query(filename, genome_id, selected_loci, database):
    """Build an allele profile for one genome by querying the sequence table.

    Reads the FASTA records in *filename*, converts each sequence to its
    allele id via ``operations.make_seqid``, and asks the database which
    (locus_id, allele_id) pairs exist among *selected_loci*.

    Returns a single-column pandas Series indexed by locus_id and named
    after *genome_id*.
    """
    # TODO: collect new alleles from here

    def _quote(value):
        # Defensive escaping: the statement below is assembled by string
        # formatting, so an embedded single quote would break the SQL (or
        # allow injection). Doubling quotes is standard SQL escaping.
        return "'{}'".format(str(value).replace("'", "''"))

    allele_ids = ",".join(_quote(operations.make_seqid(rec.seq))
                          for rec in SeqIO.parse(filename, "fasta"))
    locus_ids = ",".join(_quote(x) for x in selected_loci)
    # NOTE(review): if the FASTA file or selected_loci is empty this still
    # produces "in ()", which most SQL dialects reject — presumably callers
    # never pass empty inputs; verify upstream.
    query = ("select locus_id, allele_id from sequence "
             "where allele_id in ({}) and locus_id in ({});").format(allele_ids, locus_ids)
    # ensure allele_id is mapped only once
    profile = sql_query(query, database=database).drop_duplicates("allele_id")
    # ensure locus_id exists only once
    profile = profile.drop_duplicates("locus_id").set_index("locus_id")
    # collapse to a Series whose name is the genome id
    profile = profile.rename(columns={"allele_id": genome_id}).iloc[:, 0]
    return profile
def get(self, id):
    """Start asynchronous profiling for an uploaded batch.

    *id* is a 32-character batch id taken from the URL; anything else is a
    404. Returns a 200 acknowledgement immediately while profiling runs in
    a background thread.
    """
    # Note: `id` shadows the builtin, but the name is part of the Flask
    # URL-rule interface (passed as a keyword), so it must stay.
    # Batch ids are 32-char digests; also reject non-alphanumeric content
    # because `id` is interpolated into SQL below (injection guard).
    if len(id) != 32 or not id.isalnum():
        abort(404)
    sql = "select * from upload where batch_id='{}';".format(id)
    results = db.sql_query(sql, database=DB).to_dict(orient="records")
    if not results:
        abort(404)
    # Fire-and-forget daemon thread so the long profiling run does not
    # block the HTTP response.
    Thread(target=internals.profiling_api, args=(id, "Salmonella_5k", 95), daemon=True).start()
    return {"message": "Profiling dataset {}".format(id)}, 200
def get(self, id):
    """Look up upload records by batch id (32 chars) or seq id (64 chars).

    Returns the matching rows as a list of dicts, or aborts with 404 when
    the id has an unexpected length or matches nothing.
    """
    # `id` shadows the builtin but is fixed by the Flask URL-rule interface.
    # Ids are digests; reject non-alphanumeric input because the value is
    # interpolated into SQL below (injection guard).
    if not id.isalnum():
        abort(404)
    if len(id) == 32:
        sql = "select seq_id, batch_id, filename from upload where batch_id='{}';".format(
            id)
    elif len(id) == 64:
        sql = "select seq_id, batch_id, filename from upload where seq_id='{}';".format(
            id)
    else:
        abort(404)
    results = db.sql_query(sql, database=DB).to_dict(orient="records")
    if not results:
        abort(404)
    return results
def profiling(output_dir, input_dir, database, threads, occr_level=None, selected_loci=None, logger=None, aligcov_cut=0.5, identity=90):
    """Run the wgMLST profiling pipeline over the genomes in *input_dir*.

    Two modes, chosen by *database*:
      * if *database* is a directory on disk, use its reference sequences
        (panRefSeq.fa) with ``profile_loci`` / ``profile_alleles``;
      * otherwise treat *database* as a SQL database name, identify loci per
        genome in parallel and assemble profiles via ``profile_by_query``.

    The combined profile table is written to ``<output_dir>/wgmlst.tsv``
    (database mode); a contig rename map is always written to
    ``<output_dir>/namemap.json``.

    Args:
        output_dir: destination directory for results.
        input_dir: directory of input genome FASTA files.
        database: directory path of a file-based reference DB, or a SQL
            database name.
        threads: worker count for BLAST/process-pool steps.
        occr_level: minimum locus occurrence (%) used to select scheme loci
            when *selected_loci* is not given.
        selected_loci: explicit iterable of locus ids; overrides occr_level.
        logger: optional logger; a console logger is created if omitted.
        aligcov_cut: alignment-coverage cutoff passed to profile_loci.
        identity: percent-identity cutoff passed to profile_loci.
    """
    load_database_config()
    if not logger:
        logger = logs.console_logger(__name__)
    logger.info("Renaming contigs...")
    # Work on renamed copies of the inputs in <output_dir>/query; the
    # old->new name mapping is persisted for later reference.
    query_dir = files.joinpath(output_dir, "query")
    files.create_if_not_exist(query_dir)
    namemap = rename(query_dir, input_dir)
    with open(files.joinpath(output_dir, "namemap.json"), "w") as f:
        f.write(json.dumps(namemap))
    if os.path.isdir(database):
        # File-based reference database mode.
        logger.info("Profiling loci...")
        refseq_fna = files.joinpath(database, "panRefSeq.fa")
        profile_loci(refseq_fna, query_dir, output_dir, aligcov_cut, identity, threads)
        logger.info("Allocating alleles...")
        profile_alleles(query_dir, database, output_dir, threads, occr_level)
    else:
        # SQL database mode.
        logger.info("Identifying loci and allocating alleles...")
        # select loci by scheme
        if selected_loci:
            selected_loci = set(selected_loci)
        else:
            # NOTE(review): occr_level is interpolated directly into SQL;
            # assumed to be a trusted integer — confirm at call sites.
            # ("occurence" spelling matches the schema column name.)
            query = "select locus_id from scheme where occurence>={};".format(occr_level)
            selected_loci = set(sql_query(query, database=database).iloc[:, 0])
        temp_dir = os.path.join(query_dir, "temp")
        files.create_if_not_exist(temp_dir)
        collect = []
        # One (input file, temp dir) task per renamed genome FASTA.
        args = [(os.path.join(query_dir, filename), temp_dir)
                for filename in os.listdir(query_dir) if filename.endswith(".fa")]
        with ProcessPoolExecutor(threads) as executor:
            # identify_loci writes per-genome locus FASTA files; build one
            # profile Series per genome and collect them.
            for filename in executor.map(identify_loci, args):
                genome_id = files.fasta_filename(filename)
                target_file = os.path.join(temp_dir, genome_id + ".locus.fna")
                profile = profile_by_query(target_file, genome_id, selected_loci, database)
                collect.append(profile)
        # Genomes as columns, loci as rows.
        result = pd.concat(collect, axis=1)
        result.to_csv(files.joinpath(output_dir, "wgmlst.tsv"), sep="\t")
    # Remove the working copy of the inputs once profiling is done.
    shutil.rmtree(query_dir)
def get(self):
    """Return every uploaded sequence as a list of record dicts."""
    uploads = db.sql_query("select seq_id, filename from upload;", database=DB)
    return uploads.to_dict(orient="records")
def get(self):
    """List the ids of all stored dendrograms as record dicts."""
    dendrograms = db.sql_query("select id from dendrogram;", database=DB)
    return dendrograms.to_dict(orient="records")
def get(self):
    """List all profiling runs (id, occurrence, database) as record dicts."""
    profiles = db.sql_query("select id, occurrence, database from profile;", database=DB)
    return profiles.to_dict(orient="records")