# Walk every batch JSON file of one sequencing run and, for each library with
# a non-zero read total, locate its sample-composition output file(s).
import glob
import gzip
import json
import os

import pandas as pd

from bin_sample_composition import bin_reads
from ncbi_taxonomy_utils import ncbi_taxonomy

# Instantiated once at import time and shared by the rest of the script.
ncbi = ncbi_taxonomy()

results_dir = '/data/analysis_group1/idbd_rnd/results/191104_NB551543_0156_AHTLL7AFXY'
batch_file_paths = glob.glob(os.path.join(results_dir, 'batch', '*'))
results = {}

for batch_file in batch_file_paths:
    # The handle is deliberately not called `input`, which would shadow the
    # builtin for the remainder of the module.
    with open(batch_file) as batch_fh:
        batch_obj = json.load(batch_fh)

    for lib in batch_obj['libraries']:
        seq_sple = lib['seqSple']
        accession = lib['bioSple']
        print(accession)

        # Look the library up in the batch-level DNA read distribution by its
        # 'bioSple' value; if several entries match, the last one wins.
        total_reads = 0
        for batch_lib in batch_obj['batch']['readsDist']['DNA']:
            if batch_lib['bioSple'] == accession:
                total_reads = batch_lib['postQualityReads']
        print(total_reads)

        # Libraries with no matching entry (or a zero total) are skipped.
        if total_reads == 0:
            continue

        summary_paths = lib['diagnosticOutput']
        composition_path = glob.glob(
            os.path.join(results_dir, 'tax',
                         seq_sple + '*sample_composition.out'))
def bin_reads(sample_composition_path, ncbi_class=None,
              quantification='relative', ctrl_taxids=None):
    """Bin per-taxid read counts into broad organism categories.

    The input file is read as tab-delimited text with two header-less
    columns, ``taxid`` and ``count``.  Each row is assigned to exactly one
    category, checked in this order: Controls, Human, Fungus, Parasite,
    Bacteria, Virus, and finally Unclassified.

    Args:
        sample_composition_path: Path of the tab-delimited composition file.
        ncbi_class: Object exposing ``get_path(taxid)``; the returned value
            must support membership tests and iteration (presumably the
            lineage of the taxid - confirm against ``ncbi_taxonomy``).  When
            ``None`` a new ``ncbi_taxonomy()`` is created.
        quantification: ``'relative'`` (percent of all binned reads),
            ``'absolute'`` (summed counts) or ``'both'`` (a nested dict with
            ``'relative'`` and ``'absolute'`` keys per category).
        ctrl_taxids: Container of taxids to count as controls; ``None``
            means no controls.

    Returns:
        Dict keyed by ``"Human"``, ``"Bacteria"``, ``"Virus"``,
        ``"Parasite"``, ``"Fungus"``, ``"Unclassified"`` and ``"Controls"``.
        Values are numpy scalars.  When nothing was counted the total is
        zero and the relative values are NaN (numpy division by zero).

    Raises:
        ValueError: If ``quantification`` is not one of the supported modes,
            or a taxid/count field cannot be parsed as an integer.
    """
    # Imported here so the function does not rely on module-level imports
    # of these names.
    import csv

    import numpy as np

    valid_modes = ('relative', 'absolute', 'both')
    if quantification not in valid_modes:
        raise ValueError(
            "quantification must be one of {}, got {!r}".format(
                valid_modes, quantification))

    # The default of None previously crashed the membership test below.
    controls = () if ctrl_taxids is None else ctrl_taxids

    with open(sample_composition_path, newline='') as handle:
        rows = list(csv.DictReader(handle, delimiter='\t',
                                   fieldnames=['taxid', 'count']))

    ncbi = ncbi_taxonomy() if ncbi_class is None else ncbi_class

    human_taxid = 9606
    bacteria_taxid = 2
    virus_taxid = 10239
    fungus_taxid = 4751
    parasite_taxids = {
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088,
        6199, 85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634,
        5988, 6249, 5738, 1489900, 740972, 1485168, 37104, 10232
    }

    # Insertion order fixes both the summation order and the key order of
    # the returned dict.
    binned = {
        "Human": [],
        "Bacteria": [],
        "Virus": [],
        "Parasite": [],
        "Fungus": [],
        "Unclassified": [],
        "Controls": [],
    }

    for row in rows:
        taxid = int(row['taxid'])
        lineage = ncbi.get_path(taxid)

        raw_count = row['count']
        if isinstance(raw_count, int):
            count = raw_count
        elif isinstance(raw_count, str) and raw_count:
            # Drop any fractional part ("12.0" -> 12) before converting.
            count = np.int64(raw_count.split('.')[0])
        else:
            # Empty or missing count column: skip the row instead of
            # silently reusing the previous row's count.
            continue

        if taxid in controls:
            category = "Controls"
        elif human_taxid in lineage:
            category = "Human"
        elif fungus_taxid in lineage:
            category = "Fungus"
        elif not parasite_taxids.isdisjoint(lineage):
            category = "Parasite"
        elif bacteria_taxid in lineage:
            category = "Bacteria"
        elif virus_taxid in lineage:
            category = "Virus"
        else:
            category = "Unclassified"
        binned[category].append(count)

    absolute = {name: np.array(counts).sum()
                for name, counts in binned.items()}
    if quantification == 'absolute':
        return absolute

    total = sum(absolute.values())
    relative = {name: 100 * value / total
                for name, value in absolute.items()}
    if quantification == 'relative':
        return relative

    return {name: {"relative": relative[name], "absolute": absolute[name]}
            for name in absolute}