Esempio n. 1
0
import json
from bin_sample_composition import bin_reads
import glob
import os
from ncbi_taxonomy_utils import ncbi_taxonomy
ncbi = ncbi_taxonomy()
import gzip
import pandas as pd

# Hard-coded directory holding the results to process.
results_dir = '/data/analysis_group1/idbd_rnd/results/191104_NB551543_0156_AHTLL7AFXY'
# Every entry under <results_dir>/batch is opened below and parsed as JSON.
batch_file_paths = glob.glob(os.path.join(results_dir, 'batch', '*'))

# Accumulator for per-library results; it is not populated in the portion of
# the script visible here.
results = {}
for batch_file in batch_file_paths:
    # NOTE(review): `input` shadows the builtin of the same name for the rest
    # of the module.
    with open(batch_file) as input:
        batch_obj = json.load(input)
    for lib in batch_obj['libraries']:
        seq_sple = lib['seqSple']
        accession = lib['bioSple']
        print(accession)
        # Look up this library's post-quality read count among the batch-level
        # DNA read-distribution entries, matching on 'bioSple'. There is no
        # `break`, so if several entries match, the last one wins.
        total_reads = 0
        for batch_lib in batch_obj['batch']['readsDist']['DNA']:
            if batch_lib['bioSple'] == accession:
                total_reads = batch_lib['postQualityReads']
                print(total_reads)
        # Skip libraries with no matching DNA entry or with zero reads.
        if total_reads == 0:
            continue
        # NOTE(review): `summary_paths` is not used in the visible part of the
        # script; confirm it is consumed further down.
        summary_paths = lib['diagnosticOutput']
        # Files in <results_dir>/tax whose names start with `seq_sple` and end
        # with 'sample_composition.out'. glob returns a (possibly empty) list.
        composition_path = glob.glob(
            os.path.join(results_dir, 'tax',
                         seq_sple + '*sample_composition.out'))
Esempio n. 2
0
def _parse_read_count(raw_count):
    """Return the integer read count for a raw ``count`` field, or ``None``.

    Integers are returned unchanged. Non-empty strings are truncated at the
    first ``'.'`` (so ``'12.0'`` becomes ``12``) and converted with
    ``np.int64``. Anything else, such as an empty string or the ``None`` that
    ``csv.DictReader`` yields for a missing column, returns ``None`` so the
    caller can skip the row.
    """
    if isinstance(raw_count, int):
        return raw_count
    if isinstance(raw_count, str) and raw_count:
        return np.int64(raw_count.split('.')[0])
    return None


def bin_reads(sample_composition_path,
              ncbi_class=None,
              quantification='relative',
              ctrl_taxids=None):
    """Bin the read counts of a sample-composition file into broad categories.

    The file is read as tab-separated ``taxid<TAB>count`` rows without a
    header. Each row is assigned to exactly one category, checked in this
    order: Controls (taxid listed in ``ctrl_taxids``), Human, Fungus,
    Parasite, Bacteria, Virus, and otherwise Unclassified. All categories
    except Controls are decided by membership of marker taxids in the path
    returned by ``ncbi.get_path(taxid)``.

    Args:
        sample_composition_path: Path to the tab-separated composition file.
        ncbi_class: Object exposing ``get_path(taxid)``. When ``None``, a new
            ``ncbi_taxonomy()`` instance is created.
        quantification: ``'relative'`` (percent of all binned reads),
            ``'absolute'`` (summed counts) or ``'both'`` (a nested dict with
            ``'relative'`` and ``'absolute'`` keys per category).
        ctrl_taxids: Container of integer taxids counted as controls.
            ``None`` means there are no control taxids.

    Returns:
        A dict keyed by "Human", "Bacteria", "Virus", "Parasite", "Fungus",
        "Unclassified" and "Controls". Values are NumPy scalars. Relative
        values are NaN when no reads were binned at all.

    Raises:
        ValueError: If ``quantification`` is not one of the supported modes,
            or if a taxid or count field cannot be parsed as an integer.
    """
    if quantification not in ('relative', 'absolute', 'both'):
        raise ValueError(
            "quantification must be 'relative', 'absolute' or 'both', "
            "got {!r}".format(quantification))
    # The default of None used to crash on the membership test below.
    if ctrl_taxids is None:
        ctrl_taxids = ()

    # newline='' is what the csv module documentation recommends for readers.
    with open(sample_composition_path, newline='') as handle:
        rows = list(csv.DictReader(handle,
                                   delimiter='\t',
                                   fieldnames=['taxid', 'count']))

    ncbi = ncbi_taxonomy() if ncbi_class is None else ncbi_class

    human_taxid = 9606
    bacteria_taxid = 2
    virus_taxid = 10239
    fungus_taxid = 4751
    parasite_taxids = {
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088, 6199,
        85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634, 5988,
        6249, 5738, 1489900, 740972, 1485168, 37104, 10232
    }

    # Insertion order here fixes the key order of the returned dict.
    binned = {
        "Human": [],
        "Bacteria": [],
        "Virus": [],
        "Parasite": [],
        "Fungus": [],
        "Unclassified": [],
        "Controls": [],
    }

    for row in rows:
        taxid = int(row['taxid'])
        count = _parse_read_count(row['count'])
        if count is None:
            # Row without a usable count: skip it instead of reusing the
            # previous row's value.
            continue
        if taxid in ctrl_taxids:
            binned["Controls"].append(count)
            continue

        # The taxonomy lookup is deferred until it is actually needed.
        lineage = ncbi.get_path(taxid)
        if human_taxid in lineage:
            category = "Human"
        elif fungus_taxid in lineage:
            category = "Fungus"
        elif not parasite_taxids.isdisjoint(lineage):
            category = "Parasite"
        elif bacteria_taxid in lineage:
            category = "Bacteria"
        elif virus_taxid in lineage:
            category = "Virus"
        else:
            category = "Unclassified"
        binned[category].append(count)

    # np.array([]).sum() is 0.0, so empty categories still yield a number.
    absolute = {name: np.array(counts).sum()
                for name, counts in binned.items()}
    if quantification == 'absolute':
        return absolute

    cumulative_sum = sum(absolute.values())
    relative = {name: 100 * value / cumulative_sum
                for name, value in absolute.items()}
    if quantification == 'relative':
        return relative

    return {
        name: {"relative": relative[name], "absolute": absolute[name]}
        for name in absolute
    }