Example #1
import os
from collections import Counter, defaultdict

import pandas as pd
from Bio import SeqIO


def collect_allele_info(profiles, ffn_dir):
    freq = defaultdict(Counter)
    new_profiles = []
    # iteritems() was removed in Python 3 (dicts) and pandas 2.0 (Series);
    # items() is the replacement in both cases.
    for subject, profile in profiles.items():
        ffn_file = os.path.join(ffn_dir, "{}.ffn".format(subject))
        seqs = {
            record.id: record.seq
            for record in SeqIO.parse(ffn_file, "fasta")
        }

        new_profile = pd.Series(name=subject, dtype=object)
        for locus, prokka_str in profile.dropna().items():
            if "\t" not in prokka_str:
                allele = seqs[prokka_str]
                freq[locus].update([allele])
                new_profile.at[locus] = operations.make_seqid(allele)
            else:
                # A tab-separated cell holds several Prokka IDs for the
                # same locus; record every corresponding allele.
                prokka_ids = prokka_str.split("\t")
                alleles = [seqs[x] for x in prokka_ids]
                freq[locus].update(alleles)
                v = "\t".join(operations.make_seqid(x) for x in alleles)
                new_profile.at[locus] = v
        new_profiles.append(new_profile)
    new_profiles = pd.concat(new_profiles,
                             axis=1).sort_index().sort_index(axis=1)
    return new_profiles, freq
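Every example on this page funnels sequences through operations.make_seqid, whose definition is not shown here. A minimal sketch of the idea, assuming the ID is simply a content hash of the DNA string (the project's real implementation may differ):

import hashlib


def make_seqid(seq):
    # Hypothetical stand-in: a deterministic, content-derived ID, so the
    # same allele sequence always receives the same identifier.
    return hashlib.sha256(str(seq).upper().encode("ascii")).hexdigest()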
Example #2
from Bio import SeqIO


def profile_by_query(filename, genome_id, selected_loci, database):
    # TODO: collect new alleles from here
    allele_ids = ",".join("'{}'".format(operations.make_seqid(rec.seq))
                          for rec in SeqIO.parse(filename, "fasta"))
    locus_ids = ",".join("'{}'".format(x) for x in selected_loci)
    query = ("select locus_id, allele_id from sequence "
             "where allele_id in ({}) and locus_id in ({});"
             .format(allele_ids, locus_ids))
    # keep each allele_id and each locus_id at most once
    profile = sql_query(query, database=database).drop_duplicates("allele_id")
    profile = profile.drop_duplicates("locus_id").set_index("locus_id")
    profile = profile.rename(columns={"allele_id": genome_id}).iloc[:, 0]
    return profile
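The query above interpolates its values directly into the SQL string, which works for internally generated IDs but is fragile in general. If the backend allows parameter binding, the same lookup can use placeholders; a sketch using sqlite3 and pandas purely for illustration (the function name, the SQLite backend, and the table layout are all assumptions):

import sqlite3

import pandas as pd


def profile_by_query_params(allele_ids, locus_ids, genome_id, db_path):
    # Bind the ID lists as parameters instead of formatting them in.
    query = ("select locus_id, allele_id from sequence "
             "where allele_id in ({}) and locus_id in ({});".format(
                 ",".join("?" * len(allele_ids)),
                 ",".join("?" * len(locus_ids))))
    with sqlite3.connect(db_path) as conn:
        profile = pd.read_sql_query(query, conn,
                                    params=list(allele_ids) + list(locus_ids))
    profile = profile.drop_duplicates("allele_id").drop_duplicates("locus_id")
    return (profile.set_index("locus_id")
            .rename(columns={"allele_id": genome_id}).iloc[:, 0])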
Example #3
import os
import subprocess

from Bio import SeqIO


def identify_alleles(args):
    # Arguments arrive packed in one tuple so the function can be mapped
    # over a process pool.
    filename, out_dir, model = args
    subprocess.run(cmds.form_prodigal_cmd(filename, out_dir, model),
                   shell=True)
    genome_id = files.fasta_filename(filename)
    target_file = os.path.join(out_dir, genome_id + ".locus.fna")
    alleles = {
        operations.make_seqid(rec.seq): (rec.seq, rec.seq.translate(table=11))
        for rec in SeqIO.parse(target_file, "fasta")
    }
    return genome_id, alleles
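The single-tuple signature is the usual calling shape for multiprocessing.Pool.map. A hypothetical driver (the wrapper name and worker count are made up):

from multiprocessing import Pool


def identify_alleles_parallel(filenames, out_dir, model, workers=4):
    # One (filename, out_dir, model) tuple per genome; each worker
    # unpacks it inside identify_alleles.
    jobs = [(f, out_dir, model) for f in filenames]
    with Pool(workers) as pool:
        results = pool.map(identify_alleles, jobs)
    return dict(results)  # genome_id -> {seqid: (dna, peptide)}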
Example #4
def save_sequences(freq, refseqs, dbname):
    alleles = []
    pairs = []
    # Only the locus keys are needed here; the per-allele counters are
    # written out elsewhere (see save_allele_freq below).
    for locus in freq:
        allele = refseqs[locus]
        count = 0  # stored count starts at zero
        dna_seq = str(allele)
        pept_seq = str(allele.translate(table=11))
        allele_id = operations.make_seqid(dna_seq)
        alleles.append((allele_id, dna_seq, pept_seq, count))
        pairs.append((allele_id, locus))
    to_allele_table(alleles, dbname)
    to_pair_table(pairs, dbname)
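to_allele_table and to_pair_table are project helpers that are not shown on this page. Purely as an illustration of the bulk insert they presumably perform, here is a sketch against SQLite; the backend, table name, and column layout are all assumptions:

import sqlite3


def to_allele_table(alleles, dbname):
    # Hypothetical: rows are (allele_id, dna_seq, peptide_seq, count)
    # tuples, matching what save_sequences builds above.
    with sqlite3.connect(dbname) as conn:
        conn.executemany("insert into alleles values (?, ?, ?, ?)", alleles)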
Example #5
import json


def save_allele_freq(freq, allele_freq_file):
    allele_freq = {}
    for locus, counter in freq.items():
        allele_freq[locus] = {
            operations.make_seqid(str(allele)): count
            for allele, count in counter.items()
        }
    with open(allele_freq_file, "w") as file:
        json.dump(allele_freq, file)
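Reading the file back is symmetric; a small sketch that restores one Counter per locus from the nested dict save_allele_freq writes (load_allele_freq is a hypothetical name):

import json
from collections import Counter


def load_allele_freq(allele_freq_file):
    with open(allele_freq_file) as file:
        raw = json.load(file)
    # Keys are now seqids rather than raw sequences.
    return {locus: Counter(counts) for locus, counts in raw.items()}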
Example #6
def save_locusfiles(freq, locus_dir):
    # One FASTA file per locus, holding every allele seen for that locus.
    for locus, counter in freq.items():
        records = [
            seq.new_record(operations.make_seqid(str(allele)), allele)
            for allele in counter
        ]
        seq.save_records(records, files.joinpath(locus_dir, locus + ".fa"))
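seq.new_record and seq.save_records are thin project wrappers. Given that the surrounding code already uses Biopython, they plausibly reduce to the following sketch (an assumption, not the project's actual code):

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def new_record(seqid, sequence):
    # Wrap a raw sequence in a SeqRecord whose FASTA header is the seqid.
    return SeqRecord(Seq(str(sequence)), id=seqid, description="")


def save_records(records, path):
    SeqIO.write(records, path, "fasta")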
Example #7
import os
import subprocess


def make_database(output_dir, drop_by_occur, logger=None, threads=2):
    if not logger:
        lf = logs.LoggerFactory()
        lf.addConsoleHandler()
        lf.addFileHandler(os.path.join(output_dir, "make_database.log"))
        logger = lf.create()
    db.load_database_config(logger=logger)

    logger.info("Calculating the pan genome...")
    min_identity = 95
    c = cmds.form_roary_cmd(os.path.join(output_dir, "GFF"), output_dir,
                            min_identity, threads)
    logger.info("Run roary with following command: " + c)
    subprocess.run(c, shell=True)

    logger.info("Creating database")
    # Strip a trailing slash so basename() yields the directory name.
    dbname = os.path.basename(output_dir.rstrip("/"))
    db.createdb(dbname)
    db.create_pgadb_relations(dbname)

    logger.info("Extract profiles from roary result matrix...")
    matrix_file = os.path.join(output_dir, "roary",
                               "gene_presence_absence.csv")
    profiles, total_isolates = extract_profiles(matrix_file, dbname)

    logger.info("Collecting allele profiles and building allele "
                "frequencies and reference sequences...")
    ffn_dir = os.path.join(output_dir, "FFN")
    profile_file = os.path.join(output_dir, "allele_profiles.tsv")
    profiles, freq = collect_allele_info(profiles, ffn_dir)

    logger.info("Checking duplicated loci by self-blastp...")
    blastp_out_file, ref_length = reference_self_blastp(output_dir, freq)

    logger.info("Filtering out high-identity loci and dropping loci with "
                "occurrence less than {}...".format(drop_by_occur))
    filtered_loci = filter_locus(blastp_out_file, ref_length, total_isolates,
                                 drop_by_occur)
    os.remove(blastp_out_file)

    logger.info("Updating and saving profiles...")
    freq = {locus: freq[locus] for locus in filtered_loci}
    profiles = profiles[profiles.index.isin(filtered_loci)]
    profiles.to_csv(profile_file, sep="\t")

    logger.info("Saving allele sequences...")
    # The most frequent allele of each locus becomes its reference.
    refseqs = {
        locus: counter.most_common(1)[0][0]
        for locus, counter in freq.items()
    }
    save_sequences(freq, refseqs, dbname)

    logger.info("Making dynamic schemes...")
    refseqs = {
        locus: operations.make_seqid(refseq)
        for locus, refseq in refseqs.items()
    }
    make_schemes(refseqs, total_isolates)
    logger.info("Done!!")
    return dbname
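A hypothetical invocation of the whole pipeline; the directory name is made up, and it is expected to already contain the GFF/ and FFN/ inputs referenced above:

if __name__ == "__main__":
    dbname = make_database("pan_genome_out", drop_by_occur=2, threads=8)
    print("created database:", dbname)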