Example #1
import pandas as pd

from meta.scripts.Utilities import Utilities


def join_by_value_columns(tables: list, index_col_name: str,
                          value_col_name_: str):
    # Load each TSV, keep only its value column (renamed after the source
    # file), and join the columns side by side on the shared index
    dfs_list = [
        Utilities.load_tsv(i).set_index(index_col_name)[value_col_name_].rename(i)
        for i in tables
    ]
    out = pd.concat(dfs_list, axis=1, sort=False).sort_index()
    out.index.names = [index_col_name]
    return out
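
A minimal usage sketch; the file names and column names below are hypothetical:

coverage_df = join_by_value_columns(
    tables=["sample_01.tsv", "sample_02.tsv"],
    index_col_name="reference_id",
    value_col_name_="coverage")
# Result: one row per reference_id, one value column per input file,
# each column named after its source file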
Example #2
def join_and_annotate(self):
    annotation_df = Utilities.load_tsv(
        self.annotation_file).set_index(REFERENCE_COL_NAME)
    values_df = self.join_by_value_columns(self.coverage_files,
                                           REFERENCE_COL_NAME,
                                           self.value_col_name)
    out = pd.concat([annotation_df, values_df], axis=1, sort=False)
    out.index.names = [REFERENCE_COL_NAME]
    return out
Example #3
def annotate(self):
    self._raw_nfasta_df = Utilities.load_tsv(self.annotation_file)
    raw_nfasta_headers = self._raw_nfasta_df["former_id"].values.tolist()
    processed_nfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_nfasta_header, raw_nfasta_headers)
    ]
    self._processed_nfasta_df = Utilities.merge_pd_series_list(
        processed_nfasta_headers).sort_values("former_id")
    # Zero-fill width is the length of the longest VFDB ID;
    # note that a plain `len(max(...))` would pick the lexicographic maximum
    zf_len = max(len(i) for i in self._processed_nfasta_df["vfdb_id"].values.tolist())
    # Join table assembled from pFASTA headers
    raw_pfasta_headers = []
    with open(self._raw_pfasta_file, mode="r", encoding="utf-8") as _f:
        for _line in _f:
            if _line.startswith(">"):
                raw_pfasta_headers.append(re.sub("^>", "", _line).strip())
    raw_pfasta_headers = sorted(set(i for i in raw_pfasta_headers if len(i) > 0))
    processed_pfasta_headers = [
        Utilities.dict2pd_series(i) for i in Utilities.multi_core_queue(
            self._mp_parse_pfasta_header, raw_pfasta_headers)
    ]
    self._processed_pfasta_df = Utilities.merge_pd_series_list(
        processed_pfasta_headers).sort_values("protein_header")
    self._processed_pfasta_df["vfdb_id"] = self._processed_pfasta_df[
        "vfdb_id"].str.zfill(zf_len)
    # Join the provided table. Note: the table file must sit in the same
    # directory as the merged protein FASTA file
    vfs_table_file = os.path.join(os.path.dirname(self._raw_pfasta_file), "VFs.xls")
    vfs_df = pd.read_excel(vfs_table_file, sheet_name="VFs", header=1).fillna("")
    vfs_df["vfdb_id"] = vfs_df["VFID"].str.extract(r"VF(\d+)")[0].str.zfill(zf_len)
    self.merged_df = pd.concat([
        i.set_index("vfdb_id").sort_index()
        for i in [self._processed_nfasta_df, self._processed_pfasta_df, vfs_df]
    ], axis=1, sort=False).sort_index()
    self.merged_df.index.names = ["vfdb_id"]
    self.merged_df = self.merged_df.loc[
        self.merged_df["former_id"].str.len() > 0].reset_index()
    self.merged_df = Utilities.left_merge(self._raw_nfasta_df,
                                          self.merged_df, "former_id")
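
The method hinges on one alignment trick: the numeric part of a VFDB accession
such as "VF0123" is extracted and zero-padded to a common width so that all
three tables share the same index. A minimal self-contained illustration:

import pandas as pd

ids = pd.Series(["VF12", "VF345", "VF6789"])
# Extract the digits and left-pad with zeros to a fixed width
padded = ids.str.extract(r"VF(\d+)")[0].str.zfill(4)
print(padded.tolist())  # ['0012', '0345', '6789']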
Example #4
# The source cuts off the definition line; the name and signature below are
# inferred from the body and are illustrative only
def prettify_column_names(df: pd.DataFrame, capitalize: bool = False):
    _df = df.copy()
    if capitalize:
        # Upper-case the first letter of each column name
        _df.rename(columns={i: i[0].upper() + i[1:] for i in _df.columns},
                   inplace=True)
    # Replace underscores with spaces in all column names
    return _df.rename(columns={i: i.replace("_", " ") for i in _df.columns})
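
A quick usage sketch (the function name above, and hence this call, are
illustrative):

raw = pd.DataFrame(columns=["sample_name", "patient_id"])
print(prettify_column_names(raw, capitalize=True).columns.tolist())
# ['Sample name', 'Patient id']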


data_dir = "./inicolaeva/klebsiella_infants/datasets"
article_dir = os.path.join(ProjectDescriber.DATA_DIGEST_DIR, "article")
INDEX_COL_NAME = "sample_name"

antibacterial_agents = ['klebsiella_phage', 'pyo_bacteriophage']
initial_sample_data_df = Utilities.load_tsv(
    os.path.join(data_dir, "initial_sample_data.tsv")
).loc[:, [INDEX_COL_NAME] + [
    'sample_number', 'delivery', 'patient_id', 'checkpoint_age_days',
    'checkpoint_kpneumoniae_lg_cfu_per_g', 'extended-spectrum_beta-lactamases'
] + antibacterial_agents].set_index(INDEX_COL_NAME).sort_index()
initial_sample_data_df["delivery"].replace({
    "vaginal": "V",
    "caesarean": "C"
},
                                           inplace=True)
initial_sample_data_df["extended-spectrum_beta-lactamases"].replace(
    {
        True: "+",
        False: "-"
    }, inplace=True)
for antibacterial_agent in antibacterial_agents:
    initial_sample_data_df[antibacterial_agent].replace(
        {
Example #5
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

#%%

import os
import pandas as pd
from meta.scripts.Utilities import Utilities

#%%

sra_dir = "/data1/bio/projects/vradchenko/lactobacillus_salivarius/sra"
sra_df = Utilities.load_tsv(os.path.join(sra_dir, "sra.tsv"))

# One task per reads file: a dict pairing the callable with its kwargs
queue = [{
    "func": Utilities.count_reads_statistics,
    "kwargs": {
        "reads_file": i,
        "type_": "fastq_gz"
    }
} for i in Utilities.scan_whole_dir(os.path.join(sra_dir, "reads"))]

raw_reads_base_stats = Utilities.multi_core_queue(Utilities.wrapper,
                                                  queue,
                                                  async_=True)

#%%

raw_reads_base_stat_df = pd.DataFrame(raw_reads_base_stats)
raw_reads_base_stat_df["reads_file"] = raw_reads_base_stat_df[
    "reads_file"].apply(os.path.basename)
Example #6
for sra_regular_col_name, sra_regular_value in {
        "library_strategy": "WGS",
        "library_source": "GENOMIC",
        "library_selection": "RANDOM",
        "library_layout": "paired",
        "platform": "ILLUMINA",
        "instrument_model": "Illumina MiSeq",
        "filetype": "fastq",
}.items():
    sra_df[sra_regular_col_name] = sra_regular_value

sra_df["design_description"] = raw_sampledata_df["R1"].apply(
    lambda x: os.path.dirname(x).split("_")[-1])
sra_df["library_ID"] = sra_df["sample_name"]
sra_df.set_index("sample_name", inplace=True)

submission_report_df = Utilities.load_tsv(
    "https://raw.githubusercontent.com/ivasilyev/curated_projects/master/vradchenko/lactobacillus_salivarius/data/tables/ncbi/submission_report.tsv"
).set_index("sample_name")
sra_df = pd.concat(
    [sra_df, submission_report_df.loc[:, ["BioSample", "BioProject"]]],
    axis=1,
    sort=False)
sra_df.rename(columns={
    "BioSample": "biosample_accession",
    "BioProject": "bioproject_accession"
}, inplace=True)

biosample_attributes_df = Utilities.load_tsv(
    "https://raw.githubusercontent.com/ivasilyev/curated_projects/master/vradchenko/lactobacillus_salivarius/data/tables/ncbi/biosample_attributes_microbe.tsv"
).set_index("*sample_name")
sra_df = pd.concat([
    subprocess.getoutput("rm -rf {}".format(out_dir))
    os.makedirs(out_dir)
    cmd = "spades.py --careful -o {out} -1 {i1} -2 {i2}".format(
        out=out_dir, i1=sample_file_1, i2=sample_file_2)
    log = subprocess.getoutput(cmd)
    log_file = os.path.join(out_dir, "{}_spades.log".format(sample_name))
    Utilities.dump_string(log, file=log_file)
    return {
        "sample_name": sample_name,
        "assembly": os.path.join(out_dir, "contigs.fasta")
    }


projectDescriber = ProjectDescriber()
rawSampledataDF = Utilities.load_tsv(
    "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/raw_reads.sampledata"
)
# Prepare paths
rawReadsDir = os.path.join(projectDescriber.RAW_DATA_DIR, "reads")
cutadaptDir = os.path.join(rawReadsDir, "cutadapt")
os.makedirs(cutadaptDir, exist_ok=True)
# Trim reads
cutadaptResults = Utilities.multi_core_queue(
    run_cutadapt, queue=rawSampledataDF.values.tolist())
cutadaptResultsDF = pd.DataFrame.from_dict(cutadaptResults).sort_values(
    "sample_name")
Utilities.dump_tsv(
    cutadaptResultsDF,
    table_file=projectDescriber.SAMPLE_DATA_FILE,
    col_names=["sample_name", "trimmed_file_1", "trimmed_file_2"])
# Assemble reads
Example #8
            a=idx + 1, b=ORGANISM, c=ISOLATE_PREFIX, d=sample_number)
        if seq_record_processed.description.endswith(" PLASMID"):
            plasmid_counter += 1
            seq_record_processed.description = "[plasmid-name=unnamed{0:02d}]".format(
                plasmid_counter)
        else:
            seq_record_processed.description = ""
    assemblies_annotations.append(assemblies_annotation)
    #
    SeqIO.write(seq_records_processed, assembly_target_file, "fasta")

INDEX_COL_NAME = "sample_name"
assemblies_statistics_df = pd.DataFrame(assemblies_annotations).set_index(
    INDEX_COL_NAME)
reads_statistics_file = "/data1/bio/projects/inicolaeva/klebsiella_infants/sample_data/reads_statistics.tsv"
reads_statistics_df = Utilities.load_tsv(reads_statistics_file).set_index(
    INDEX_COL_NAME)
combined_statistics_df = pd.concat(
    [reads_statistics_df, assemblies_statistics_df], axis=1, sort=False)
combined_statistics_df.index.names = [INDEX_COL_NAME]
numeric_col_names = [
    i for i in combined_statistics_df.columns
    if any(j in i for j in ("_assembly_contigs_", "_assembly_bp_"))
]
combined_statistics_df.fillna(0, inplace=True)
combined_statistics_df = combined_statistics_df.astype(
    {i: int for i in numeric_col_names})
# From NCBI template ('Template_GenomeBatch.11700383121d.xlsx'):
# The estimated base coverage across the genome, eg 12x.
# This can be calculated by dividing the number of bases sequenced by the expected genome size
# and multiplying that by the percentage of bases that were placed in the final assembly.
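
That rule of thumb translates directly into code; a minimal sketch with
made-up figures:

bases_sequenced = 1_250_000_000   # total bp across the raw reads
expected_genome_size = 5_500_000  # expected genome size, bp
placed_fraction = 0.96            # share of sequenced bases placed in the assembly
estimated_coverage = bases_sequenced / expected_genome_size * placed_fraction
print("{:.0f}x".format(estimated_coverage))  # -> "218x"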
Example #9
def count_fasta_statistics(fasta_file: str, sample_name: str = None):
    from Bio import SeqIO
    # Parse all records up front; the file is closed by the `with` block
    with open(fasta_file, mode="r", encoding="utf-8") as f:
        seq_records = list(SeqIO.parse(f, "fasta"))
    out = dict(fasta_file=fasta_file,
               fasta_sequences_number=len(seq_records),
               fasta_total_bp=sum(len(i) for i in seq_records))
    if sample_name:
        out["sample_name"] = sample_name
    return out
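
A usage sketch (the FASTA path and sample name are hypothetical):

stats = count_fasta_statistics("contigs.fasta", sample_name="sample_01")
# -> {'fasta_file': 'contigs.fasta', 'fasta_sequences_number': <int>,
#     'fasta_total_bp': <int>, 'sample_name': 'sample_01'}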


# Process assemblies
blasted_data_df = Utilities.load_tsv(
    os.path.join(ProjectDescriber.SAMPLE_DATA_DIR, "BLASTed.sampledata"))
blasted_data_df["organism"] = blasted_data_df["strain"].apply(
    lambda x: " ".join(x.split(" ")[:2]))

blasted_data_df.rename(columns={
    i: "reference_{}".format(i)
    for i in blasted_data_df.columns
    if all(j not in i for j in ["assembly", "reference", "sample"])
}, inplace=True)
assembly_files = blasted_data_df["assembly_file"].values.tolist()

assembly_stats_df = pd.DataFrame(
    Utilities.multi_core_queue(Utilities.count_assembly_statistics,
                               assembly_files))
assembly_stats_df.rename(
Example #10
TOOL_VERSIONS = dict(
    fastqc_version="quay.io/biocontainers/fastqc:0.11.8--1",
    trimmomatic_version="quay.io/biocontainers/trimmomatic:0.39--1",
    cutadapt_version="quay.io/biocontainers/cutadapt:2.5--py37h516909a_0",
    bowtie2_version="quay.io/biocontainers/bowtie2:2.3.5--py37he860b03_0",
    spades_version="quay.io/biocontainers/spades:3.9.1--0")

templates_dir = os.path.join(ProjectDescriber.ROOT_DIR, "reports", "1")
template = jinja2.Template(
    Utilities.load_string(os.path.join(templates_dir, "template.txt")))

for sample_name in SAMPLE_NAMES:
    # sample_name = SAMPLE_NAMES[0]
    #
    combined_assembly_statistics_df = Utilities.load_tsv(
        os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME,
                     "data", "tables", "combined_assembly_statistics.tsv"))
    submission_report_df = Utilities.load_tsv(
        os.path.join(".", ProjectDescriber.OWNER, ProjectDescriber.NAME,
                     "data", "tables", "ncbi", "submission_report.tsv"))
    #
    submission_combined_df = pd.concat([
        i.set_index(INDEX_COL_NAME)
        for i in (combined_assembly_statistics_df, submission_report_df)
    ], axis=1, sort=False)
    submission_combined_df.index.names = [INDEX_COL_NAME]
    #
    rendering_dict = submission_combined_df.loc[sample_name, :].to_dict()
    rendering_dict.update(TOOL_VERSIONS)
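
The example breaks off before the template is actually used; the loop
presumably ends by rendering it, roughly like this (the report file name is
assumed):

    report_text = template.render(**rendering_dict)
    Utilities.dump_string(report_text, file=os.path.join(
        templates_dir, "{}_report.txt".format(sample_name)))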