def __init__(self, sample_composition_path, ncbi_tax=None, ctrl_taxa=None):
    """Initialise the parser from a sample composition file.

    Args:
        sample_composition_path: Path handed to
            ``self._load_sample_composition``.
        ncbi_tax: Taxonomy object to reuse.  When ``None`` a new
            ``NcbiTaxonomy()`` is created.
        ctrl_taxa: Control taxa specification; it is passed unchanged to
            ``self._validate_ctrl_taxa``.

    Attributes set here: ``ncbi_tax``, ``composition_dict``,
    ``total_reads`` (numpy sum of all values in ``composition_dict``) and
    ``organism_counts``.
    """
    # Reuse the caller's taxonomy when one is supplied.
    self.ncbi_tax = NcbiTaxonomy() if ncbi_tax is None else ncbi_tax

    self._validate_ctrl_taxa(ctrl_taxa)

    composition = self._load_sample_composition(sample_composition_path)
    self.composition_dict = composition
    self.total_reads = np.sum(list(composition.values()))

    # Must run last: it is called only after the attributes above exist.
    self.organism_counts = self._get_organism_counts()
def __init__(self, sample_composition_path, ncbi_tax=None, ctrl_taxa=None):
    """Initialise the parser from a sample composition file.

    Args:
        sample_composition_path: Path handed to
            ``self._load_sample_composition``.
        ncbi_tax: Taxonomy object to reuse.  When ``None`` a new
            ``NcbiTaxonomy()`` is created.
        ctrl_taxa: Control taxa as a single int, a list of ints, or
            ``None`` (the default).

    Raises:
        ValueError: If ``ctrl_taxa`` is neither ``None``, an int, nor a
            list containing only ints.
    """
    self.ncbi_tax = NcbiTaxonomy() if ncbi_tax is None else ncbi_tax

    if isinstance(ctrl_taxa, list):
        if not all(isinstance(taxid, int) for taxid in ctrl_taxa):
            raise ValueError('ctrl_taxa must be int or list of ints.')
        self.ctrl_taxa = ctrl_taxa
    elif isinstance(ctrl_taxa, int):
        # Wrap the scalar; list(<int>) raises TypeError because an int is
        # not iterable.
        self.ctrl_taxa = [ctrl_taxa]
    elif ctrl_taxa is None:
        # The default must be usable.  Stored as a one-element list, the
        # same representation SampleCompParser._validate_ctrl_taxa uses.
        self.ctrl_taxa = [ctrl_taxa]
    else:
        raise ValueError('ctrl_taxa must be int or list of ints.')

    self.composition_dict = self._load_sample_composition(
        sample_composition_path)
    self.total_reads = np.sum(list(self.composition_dict.values()))
    self.organism_counts = self._get_organism_counts()
def get_quant(sample_info, panel_orgs, coverages):
    """Build a (samples x panel organisms) matrix of per-taxid read values.

    For every path in ``sample_info['dna_sample_comp_path']`` a
    ``SampleCompParser`` is created and ``get_taxid_nr(taxid, normalizer=1)``
    is evaluated for every entry of ``panel_orgs['taxid']``.

    Args:
        sample_info: Mapping/table with ``'seq_sple'`` and
            ``'dna_sample_comp_path'`` columns, iterated in lockstep.
        panel_orgs: Mapping/table with a ``'taxid'`` column.
        coverages: Not used by this function.

    Returns:
        ``np.array`` built from one row (array of values) per sample.
    """
    # NOTE(review): relies on the module-level name ``ncbi`` being defined
    # before the first call; it is not a parameter.
    # matrix = np.zeros(len(seq_sple_ls), len(panel_orgs))
    matrix_ls = []
    # ``seq_sple`` is unpacked but never read inside the loop.
    for seq_sple, comp_path in tqdm(zip(sample_info['seq_sple'],
                                        sample_info['dna_sample_comp_path'])):
        sample_ls = []
        parser = SampleCompParser(comp_path, ncbi_tax=ncbi)
        for taxid in panel_orgs['taxid']:
            nr = parser.get_taxid_nr(taxid, normalizer=1)
            sample_ls.append(nr)
        matrix_ls.append(np.array(sample_ls))
    return np.array(matrix_ls)


# Shared taxonomy instance; get_quant() reads it as a global.
ncbi = NcbiTaxonomy()

# Tab-separated input tables.
panel_orgs = pd.read_csv('org_taxids_uti_all_with_new.txt', sep='\t')
samples_arup = pd.read_csv('arup_sample_info.txt', sep='\t')
samples_syn = pd.read_csv('synergy_clinical_sample_info.txt', sep='\t')

# Coverage tables, indexed by their third column (index_col=2).
cov_arup = pd.read_csv('arup_coverages.csv', index_col=2)
cov_arup_fp = pd.read_csv('arup_coverages_fungpar.csv', index_col=2)
cov_syn = pd.read_csv('synergy_coverages.csv', index_col=2)
cov_syn_fp = pd.read_csv('synergy_coverages_fungpar.csv', index_col=2)

# Stack each main coverage table with its "fungpar" counterpart.
merged_arup = pd.concat([cov_arup, cov_arup_fp])
merged_syn = pd.concat([cov_syn, cov_syn_fp])

quant_arup = get_quant(samples_arup, panel_orgs, merged_arup)
# NOTE(review): the following call is cut off in the visible source; its
# remaining arguments and closing parenthesis are not shown here.
quant_arup_df = pd.DataFrame(data=quant_arup, columns=panel_orgs['name'],
class SampleCompParser:
    """Parse a per-taxid read-count file and summarise it by organism group.

    The composition file is read as tab-separated text with an integer
    taxid in the first column and an integer read count in the second.
    Lineages are obtained from ``ncbi_tax.get_path(taxid)`` and descendants
    from ``ncbi_tax.get_children(taxid)``.
    """

    # Taxids used to assign a lineage to an organism group.
    _HUMAN_TAXID = 9606
    _BACTERIA_TAXID = 2
    _VIRUS_TAXID = 10239
    _FUNGUS_TAXID = 4751
    _PARASITE_TAXIDS = frozenset({
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088,
        6199, 85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634,
        5988, 6249, 5738, 1489900, 740972, 1485168, 37104, 10232,
    })

    def __init__(self, sample_composition_path, ncbi_tax=None,
                 ctrl_taxa=None):
        """Load the composition file and pre-compute the group counts.

        Args:
            sample_composition_path: Path of the tab-separated
                ``taxid<TAB>count`` file.
            ncbi_tax: Taxonomy object to reuse; a new ``NcbiTaxonomy()`` is
                created when ``None``.
            ctrl_taxa: ``None``, an int, or a list of ints.  Taxids listed
                here are left out of the organism group counts.

        Raises:
            ValueError: If ``ctrl_taxa`` has an unsupported type.
        """
        self.ncbi_tax = NcbiTaxonomy() if ncbi_tax is None else ncbi_tax
        self._validate_ctrl_taxa(ctrl_taxa)
        self.composition_dict = self._load_sample_composition(
            sample_composition_path)
        # Note: the total includes reads of control taxa.
        self.total_reads = np.sum(list(self.composition_dict.values()))
        self.organism_counts = self._get_organism_counts()

    def _validate_ctrl_taxa(self, ctrl_taxa):
        """Normalise ``ctrl_taxa`` to a list and store it on ``self``.

        Raises:
            ValueError: If ``ctrl_taxa`` is not ``None``, an int, or a list
                containing only ints.
        """
        if isinstance(ctrl_taxa, list):
            if not all(isinstance(taxid, int) for taxid in ctrl_taxa):
                raise ValueError('ctrl_taxa must be int or list of ints.')
            self.ctrl_taxa = ctrl_taxa
        elif ctrl_taxa is None or isinstance(ctrl_taxa, int):
            # None is kept as ``[None]``; it never matches an int taxid.
            self.ctrl_taxa = [ctrl_taxa]
        else:
            raise ValueError('ctrl_taxa must be int or list of ints.')

    def _validate_taxid(self, taxid):
        """Return ``taxid`` as an int, converting it when necessary.

        Raises:
            ValueError: If ``taxid`` cannot be converted with ``int()``.
        """
        if isinstance(taxid, int):
            return taxid
        try:
            return int(taxid)
        except (TypeError, ValueError) as err:
            # Only conversion failures are translated; anything else
            # (e.g. KeyboardInterrupt) must propagate untouched.
            raise ValueError(
                f'taxid must be int, received {type(taxid)}') from err

    def _load_sample_composition(self, sample_composition_path):
        """Read the composition file into a ``{taxid: count}`` dict.

        Blank lines are skipped.  A taxid that appears more than once keeps
        the count of its last occurrence.
        """
        composition_dict = {}
        with open(sample_composition_path) as infile:
            for line in infile:
                stripped = line.strip()
                if not stripped:
                    continue
                data = stripped.split('\t')
                composition_dict[int(data[0])] = int(data[1])
        return composition_dict

    def _get_organism_counts(self):
        """Sum read counts per organism group, skipping control taxa.

        Each taxid is assigned to the first matching group in this order:
        human, fungus, parasite, bacteria, virus; anything else is counted
        as unclassified.

        Returns:
            Dict with keys ``'human'``, ``'bacteria'``, ``'virus'``,
            ``'parasite'``, ``'fungus'`` and ``'unclassified'``.
        """
        count_dict = {
            'human': 0,
            'bacteria': 0,
            'virus': 0,
            'parasite': 0,
            'fungus': 0,
            'unclassified': 0,
        }
        for taxid, count in self.composition_dict.items():
            if taxid in self.ctrl_taxa:
                continue
            taxid_path = self.ncbi_tax.get_path(taxid)
            if self._HUMAN_TAXID in taxid_path:
                group = 'human'
            elif self._FUNGUS_TAXID in taxid_path:
                group = 'fungus'
            elif not self._PARASITE_TAXIDS.isdisjoint(taxid_path):
                group = 'parasite'
            elif self._BACTERIA_TAXID in taxid_path:
                group = 'bacteria'
            elif self._VIRUS_TAXID in taxid_path:
                group = 'virus'
            else:
                group = 'unclassified'
            count_dict[group] += count
        return count_dict

    def get_total_reads(self):
        """Return the sum of all read counts in the composition file."""
        return self.total_reads

    def get_taxid_reads(self, taxid):
        """Return the read count stored for ``taxid`` (0 when absent)."""
        taxid = self._validate_taxid(taxid)
        return self.composition_dict.get(taxid, 0)

    def get_taxid_nr(self, taxid, normalizer=1e7):
        """Return ``normalizer * reads(taxid) / total_reads`` (0 if absent)."""
        taxid = self._validate_taxid(taxid)
        if taxid not in self.composition_dict:
            return 0
        return normalizer * self.composition_dict[taxid] / self.total_reads

    def get_genus_nr(self, genus_taxid, normalizer=1e7):
        """Return the summed reads of ``genus_taxid`` and its children.

        The result is the raw read sum; ``normalizer`` is accepted but not
        applied (the normalisation was already disabled in this method).
        """
        genus_taxid = self._validate_taxid(genus_taxid)
        members = set(self.ncbi_tax.get_children(genus_taxid))
        members.add(genus_taxid)
        # Match against the taxids (dict keys).  Matching against the
        # values would compare taxids with read counts.
        return sum(
            self.composition_dict[taxid]
            for taxid in members
            if taxid in self.composition_dict
        )

    def get_org_comp_abs(self):
        """Return the absolute read count per organism group."""
        return self.organism_counts

    def get_org_comp_rel(self):
        """Return each organism group as a percentage of the total reads."""
        return {
            org: 100 * count / self.total_reads
            for org, count in self.organism_counts.items()
        }

    def get_org_comp_nr(self, normalizer=1e7):
        """Return each group as ``normalizer * count / total_reads``."""
        return {
            org: normalizer * count / self.total_reads
            for org, count in self.organism_counts.items()
        }
def bin_reads(sample_composition_path, ncbi_class=None,
              quantification='relative', ctrl_taxids=None):
    """Bin the read counts of a sample composition file by organism group.

    The file is read as tab-separated ``taxid<TAB>count`` rows.  Each taxid
    is assigned to the first matching group, in this order: human, fungus,
    parasite, bacteria, virus; everything else is "Unclassified".

    Args:
        sample_composition_path: Path of the composition file.
        ncbi_class: Taxonomy object providing ``get_path(taxid)``; a new
            ``NcbiTaxonomy()`` is created when ``None``.
        quantification: ``'relative'`` (percent of all binned reads),
            ``'absolute'`` (read sums) or ``'both'`` (a nested dict with
            ``'relative'`` and ``'absolute'`` per group).
        ctrl_taxids: Taxids to leave out entirely: ``None``, a single int,
            or an iterable of taxids.

    Returns:
        Dict keyed by ``"Human"``, ``"Bacteria"``, ``"Virus"``,
        ``"Parasite"``, ``"Fungus"`` and ``"Unclassified"``.

    Raises:
        ValueError: If ``quantification`` is not one of the three modes.
    """
    modes = ('relative', 'absolute', 'both')
    if quantification not in modes:
        # Fail early instead of hitting an unbound result at ``return``.
        raise ValueError(
            f'quantification must be one of {modes}, '
            f'received {quantification!r}')

    # Normalise the exclusion list once; the default (None) excludes nothing.
    if ctrl_taxids is None:
        excluded = frozenset()
    elif isinstance(ctrl_taxids, int):
        excluded = frozenset((ctrl_taxids,))
    else:
        excluded = frozenset(int(taxid) for taxid in ctrl_taxids)

    with open(sample_composition_path, newline='') as handle:
        rows = list(csv.DictReader(handle, delimiter='\t',
                                   fieldnames=['taxid', 'count']))

    ncbi = NcbiTaxonomy() if ncbi_class is None else ncbi_class

    human_taxid = 9606
    bacteria_taxid = 2
    virus_taxid = 10239
    fungus_taxid = 4751
    parasite_taxids = frozenset({
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088,
        6199, 85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634,
        5988, 6249, 5738, 1489900, 740972, 1485168, 37104, 10232,
    })

    # Insertion order here is the key order of the returned dict.
    binned = {
        "Human": [],
        "Bacteria": [],
        "Virus": [],
        "Parasite": [],
        "Fungus": [],
        "Unclassified": [],
    }

    for row in rows:
        taxid = int(row['taxid'])
        # Compare the taxid itself (not the whole row) with the exclusions.
        if taxid in excluded:
            continue
        taxid_path = ncbi.get_path(taxid)

        raw_count = row['count']
        if isinstance(raw_count, int):
            count = raw_count
        elif isinstance(raw_count, str) and raw_count:
            # Counts such as "12.0" are truncated at the decimal point.
            count = np.int64(raw_count.split('.')[0])
        else:
            # Empty or missing count column: skip the row rather than
            # reuse the previous row's count.
            continue

        if human_taxid in taxid_path:
            group = "Human"
        elif fungus_taxid in taxid_path:
            group = "Fungus"
        elif not parasite_taxids.isdisjoint(taxid_path):
            group = "Parasite"
        elif bacteria_taxid in taxid_path:
            group = "Bacteria"
        elif virus_taxid in taxid_path:
            group = "Virus"
        else:
            group = "Unclassified"
        binned[group].append(count)

    absolute = {group: np.array(counts).sum()
                for group, counts in binned.items()}
    if quantification == 'absolute':
        return absolute

    cumulative_sum = sum(absolute.values())
    relative = {group: 100 * total / cumulative_sum
                for group, total in absolute.items()}
    if quantification == 'relative':
        return relative

    return {
        group: {"relative": relative[group], "absolute": absolute[group]}
        for group in absolute
    }