import json
import os
import subprocess
from collections import Counter, defaultdict

import pandas as pd
from Bio import SeqIO

# Project-local imports (cmds, db, files, logs, operations, seq, and helpers
# such as sql_query, to_allele_table, to_pair_table, extract_profiles,
# reference_self_blastp, filter_locus, make_schemes) are elided here.


def collect_allele_info(profiles, ffn_dir):
    """Map each Prokka ID in the profile matrix to a sequence-based allele ID
    and tally per-locus allele frequencies."""
    freq = defaultdict(Counter)
    new_profiles = []
    for subject, profile in profiles.items():
        ffn_file = os.path.join(ffn_dir, "{}.ffn".format(subject))
        seqs = {record.id: record.seq
                for record in SeqIO.parse(ffn_file, "fasta")}
        new_profile = pd.Series(name=subject, dtype=object)
        for locus, prokka_str in profile.dropna().items():
            if "\t" not in prokka_str:
                # single Prokka ID: one allele at this locus
                allele = seqs[prokka_str]
                freq[locus].update([allele])
                new_profile.at[locus] = operations.make_seqid(allele)
            else:
                # tab-separated Prokka IDs (paralogs): keep all of them
                prokka_ids = prokka_str.split("\t")
                alleles = [seqs[x] for x in prokka_ids]
                freq[locus].update(alleles)
                v = "\t".join(operations.make_seqid(x) for x in alleles)
                new_profile.at[locus] = v
        new_profiles.append(new_profile)
    new_profiles = pd.concat(new_profiles, axis=1).sort_index().sort_index(axis=1)
    return new_profiles, freq
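# A minimal sketch of the shapes involved, assuming make_seqid() returns a
# stable hash-based ID for a sequence (the locus/genome names below are
# hypothetical):
#
#   # profiles: DataFrame with loci as rows, subjects as columns, and cells
#   # holding Prokka IDs ("\t"-joined when a locus has paralogs)
#   new_profiles, freq = collect_allele_info(profiles, "FFN")
#   new_profiles.at["group_0001", "genome_A"]  # e.g. "d6b0d1..."
#   freq["group_0001"]                         # Counter({Seq('ATG...'): 12, ...})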
def profile_by_query(filename, genome_id, selected_loci, database):
    # TODO: collect new alleles from here
    allele_ids = ",".join("'{}'".format(operations.make_seqid(rec.seq))
                          for rec in SeqIO.parse(filename, "fasta"))
    locus_ids = ",".join("'{}'".format(x) for x in selected_loci)
    query = ("select locus_id, allele_id from sequence "
             "where allele_id in ({}) and locus_id in ({});"
             ).format(allele_ids, locus_ids)
    # ensure each allele_id is mapped only once
    profile = sql_query(query, database=database).drop_duplicates("allele_id")
    # ensure each locus_id exists only once
    profile = profile.drop_duplicates("locus_id").set_index("locus_id")
    profile = profile.rename(columns={"allele_id": genome_id}).iloc[:, 0]
    return profile
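# With two query alleles and two selected loci, the generated SQL would look
# like the following (IDs hypothetical):
#
#   select locus_id, allele_id from sequence
#   where allele_id in ('a1b2...', 'c3d4...') and locus_id in ('locusA', 'locusB');
#
# The returned Series is indexed by locus_id and named after genome_id, so
# profiles from several genomes can be concatenated column-wise.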
def identify_alleles(args):
    # args is a single tuple so this function can be mapped over a worker pool
    filename, out_dir, model = args
    subprocess.run(cmds.form_prodigal_cmd(filename, out_dir, model), shell=True)
    genome_id = files.fasta_filename(filename)
    target_file = os.path.join(out_dir, genome_id + ".locus.fna")
    alleles = {operations.make_seqid(rec.seq): (rec.seq, rec.seq.translate(table=11))
               for rec in SeqIO.parse(target_file, "fasta")}
    return genome_id, alleles
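# A hedged sketch of fanning identify_alleles() out over a process pool; the
# FASTA paths and the prodigal model filename below are hypothetical:
#
#   from multiprocessing import Pool
#
#   jobs = [(fasta, "query_dir", "prodigal.trn") for fasta in query_fastas]
#   with Pool(4) as pool:
#       alleles_by_genome = dict(pool.map(identify_alleles, jobs))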
def save_sequences(freq, refseqs, dbname):
    # only the reference allele of each locus is stored; its count starts at 0
    alleles = []
    pairs = []
    for locus in freq:
        allele = refseqs[locus]
        count = 0
        dna_seq = str(allele)
        pept_seq = str(allele.translate(table=11))
        allele_id = operations.make_seqid(dna_seq)
        alleles.append((allele_id, dna_seq, pept_seq, count))
        pairs.append((allele_id, locus))
    to_allele_table(alleles, dbname)
    to_pair_table(pairs, dbname)
def save_allele_freq(freq, allele_freq_file):
    allele_freq = {}
    for locus, counter in freq.items():
        allele_freq[locus] = {operations.make_seqid(str(allele)): count
                              for allele, count in counter.items()}
    with open(allele_freq_file, "w") as file:
        json.dump(allele_freq, file)
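# The resulting JSON maps each locus to a {seqid: count} table, e.g.
# (values hypothetical):
#
#   {"group_0001": {"a1b2...": 12, "c3d4...": 3},
#    "group_0002": {"e5f6...": 15}}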
def save_locusfiles(freq, locus_dir):
    for locus, counter in freq.items():
        records = [seq.new_record(operations.make_seqid(str(allele)), allele)
                   for allele in counter.keys()]
        seq.save_records(records, files.joinpath(locus_dir, locus + ".fa"))
def make_database(output_dir, drop_by_occur, logger=None, threads=2):
    if not logger:
        lf = logs.LoggerFactory()
        lf.addConsoleHandler()
        lf.addFileHandler(os.path.join(output_dir, "make_database.log"))
        logger = lf.create()
    db.load_database_config(logger=logger)

    logger.info("Calculating the pan genome...")
    min_identity = 95
    c = cmds.form_roary_cmd(os.path.join(output_dir, "GFF"),
                            output_dir, min_identity, threads)
    logger.info("Running roary with the following command: " + c)
    subprocess.run(c, shell=True)

    logger.info("Creating database...")
    dbname = os.path.basename(
        output_dir[:-1] if output_dir.endswith("/") else output_dir)
    db.createdb(dbname)
    db.create_pgadb_relations(dbname)

    logger.info("Extracting profiles from the roary result matrix...")
    matrix_file = os.path.join(output_dir, "roary", "gene_presence_absence.csv")
    profiles, total_isolates = extract_profiles(matrix_file, dbname)

    logger.info("Collecting allele profiles and making allele frequencies "
                "and reference sequences...")
    ffn_dir = os.path.join(output_dir, "FFN")
    profile_file = os.path.join(output_dir, "allele_profiles.tsv")
    profiles, freq = collect_allele_info(profiles, ffn_dir)

    logger.info("Checking for duplicated loci by self-blastp...")
    blastp_out_file, ref_length = reference_self_blastp(output_dir, freq)

    logger.info("Filtering out high-identity loci and dropping loci with "
                "occurrence less than {}...".format(drop_by_occur))
    filtered_loci = filter_locus(blastp_out_file, ref_length,
                                 total_isolates, drop_by_occur)
    os.remove(blastp_out_file)

    logger.info("Updating and saving profiles...")
    freq = {locus: freq[locus] for locus in filtered_loci}
    profiles = profiles[profiles.index.isin(filtered_loci)]
    profiles.to_csv(profile_file, sep="\t")

    logger.info("Saving allele sequences...")
    # the most common allele of each locus becomes its reference sequence
    refseqs = {locus: counter.most_common(1)[0][0]
               for locus, counter in freq.items()}
    save_sequences(freq, refseqs, dbname)

    logger.info("Making dynamic schemes...")
    refseqs = {locus: operations.make_seqid(allele)
               for locus, allele in refseqs.items()}
    make_schemes(refseqs, total_isolates)
    logger.info("Done!")
    return dbname
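# A hedged end-to-end sketch; the directory layout (GFF/ and FFN/ under
# output_dir) matches the paths used above, while the output_dir name and the
# occurrence cutoff are illustrative assumptions:
#
#   if __name__ == "__main__":
#       dbname = make_database("pan_genome_out/", drop_by_occur=95, threads=8)
#       print("created database:", dbname)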