コード例 #1
0
 def __init__(self, sample_composition_path, ncbi_tax=None,
              ctrl_taxa=None):
     """Load a sample composition file and precompute its summaries.

     Args:
         sample_composition_path: Path handed to
             ``_load_sample_composition``.
         ncbi_tax: Taxonomy object to reuse. When ``None`` a new
             ``NcbiTaxonomy`` is created.
         ctrl_taxa: Control taxa, checked by ``_validate_ctrl_taxa``.
     """
     # Reuse the caller's taxonomy when given; otherwise build our own.
     self.ncbi_tax = NcbiTaxonomy() if ncbi_tax is None else ncbi_tax

     # Validation runs before any file access.
     self._validate_ctrl_taxa(ctrl_taxa)

     composition = self._load_sample_composition(sample_composition_path)
     self.composition_dict = composition

     # Total is the sum over every value stored in the composition dict.
     read_counts = list(composition.values())
     self.total_reads = np.sum(read_counts)

     self.organism_counts = self._get_organism_counts()
コード例 #2
0
 def __init__(self, sample_composition_path, ncbi_tax=None, ctrl_taxa=None):
     """Load a sample composition file and precompute its summaries.

     Args:
         sample_composition_path: Path handed to
             ``_load_sample_composition``.
         ncbi_tax: Taxonomy object to reuse. When ``None`` a new
             ``NcbiTaxonomy`` is created.
         ctrl_taxa: Control taxid(s): a single int, a list of ints, or
             ``None`` (the default) for "no control taxa".

     Raises:
         ValueError: If ``ctrl_taxa`` is neither ``None``, an int, nor a
             list made up solely of ints.
     """
     if ncbi_tax is None:
         self.ncbi_tax = NcbiTaxonomy()
     else:
         self.ncbi_tax = ncbi_tax

     if ctrl_taxa is None:
         # The default argument must be usable: no controls configured.
         self.ctrl_taxa = []
     elif isinstance(ctrl_taxa, int):
         # Wrap the scalar; list(<int>) would raise TypeError because an
         # int is not iterable.
         self.ctrl_taxa = [ctrl_taxa]
     elif isinstance(ctrl_taxa, list) and all(
             isinstance(taxid, int) for taxid in ctrl_taxa):
         self.ctrl_taxa = ctrl_taxa
     else:
         raise ValueError('ctrl_taxa must be int or list of ints.')

     self.composition_dict = self._load_sample_composition(
         sample_composition_path)
     self.total_reads = np.sum(list(self.composition_dict.values()))
     self.organism_counts = self._get_organism_counts()
コード例 #3
0
def get_quant(sample_info, panel_orgs, coverages):
    """Build a samples-by-organisms matrix of per-taxid read values.

    For every composition path in ``sample_info['dna_sample_comp_path']``
    a ``SampleCompParser`` is created (sharing the module-level ``ncbi``
    taxonomy) and ``get_taxid_nr(taxid, normalizer=1)`` is evaluated for
    each entry of ``panel_orgs['taxid']``.

    Args:
        sample_info: Mapping/frame providing the ``'seq_sple'`` and
            ``'dna_sample_comp_path'`` columns, iterated in lockstep.
        panel_orgs: Mapping/frame providing the ``'taxid'`` column.
        coverages: Accepted for interface compatibility; not used here.

    Returns:
        ``np.array`` built from one array per sample, each holding one
        value per panel taxid.
    """
    paired = zip(sample_info['seq_sple'],
                 sample_info['dna_sample_comp_path'])
    rows = []
    # The sample name is only consumed to keep both columns in lockstep.
    for _sample_name, composition_path in tqdm(paired):
        parser = SampleCompParser(composition_path, ncbi_tax=ncbi)
        row = [parser.get_taxid_nr(taxid, normalizer=1)
               for taxid in panel_orgs['taxid']]
        rows.append(np.array(row))
    return np.array(rows)


# Shared taxonomy instance; get_quant() reads this module-level name when it
# constructs each SampleCompParser.
ncbi = NcbiTaxonomy()

# Tab-separated input tables: the organism panel and the per-cohort sample
# sheets. get_quant() reads the 'taxid' column of the panel and the
# 'seq_sple' / 'dna_sample_comp_path' columns of a sample sheet.
panel_orgs = pd.read_csv('org_taxids_uti_all_with_new.txt', sep='\t')
samples_arup = pd.read_csv('arup_sample_info.txt', sep='\t')
samples_syn = pd.read_csv('synergy_clinical_sample_info.txt', sep='\t')

# Coverage tables (comma-separated); index_col=2 makes the third column the
# row index. The *_fungpar files presumably hold the fungal/parasite
# coverages -- TODO confirm against the data.
cov_arup = pd.read_csv('arup_coverages.csv', index_col=2)
cov_arup_fp = pd.read_csv('arup_coverages_fungpar.csv', index_col=2)
cov_syn = pd.read_csv('synergy_coverages.csv', index_col=2)
cov_syn_fp = pd.read_csv('synergy_coverages_fungpar.csv', index_col=2)

# Stack the two coverage tables of each cohort row-wise.
merged_arup = pd.concat([cov_arup, cov_arup_fp])
merged_syn = pd.concat([cov_syn, cov_syn_fp])

# Samples-by-panel matrix for the ARUP cohort. The coverages argument is
# passed through but get_quant() does not currently use it.
quant_arup = get_quant(samples_arup, panel_orgs, merged_arup)
quant_arup_df = pd.DataFrame(data=quant_arup, columns=panel_orgs['name'],
コード例 #4
0
class SampleCompParser:
    """Parse a per-sample ``taxid<TAB>read count`` file and summarise it.

    On construction the file is loaded into ``composition_dict``
    (``{taxid: reads}``), ``total_reads`` is set to the sum of all counts
    (control taxa included), and ``organism_counts`` holds the reads binned
    into coarse organism groups (control taxa excluded).
    """

    # Taxonomy ids used to bin reads into coarse organism groups.
    _HUMAN_TAXID = 9606
    _BACTERIA_TAXID = 2
    _VIRUS_TAXID = 10239
    _FUNGUS_TAXID = 4751
    # A lineage containing any of these ids is counted as "parasite".
    _PARASITE_TAXIDS = frozenset({
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088,
        6199, 85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634,
        5988, 6249, 5738, 1489900, 740972, 1485168, 37104, 10232,
    })

    def __init__(self, sample_composition_path, ncbi_tax=None,
                 ctrl_taxa=None):
        """Load ``sample_composition_path`` and precompute the summaries.

        Args:
            sample_composition_path: Path of the tab-separated composition
                file (taxid in the first column, read count in the second).
            ncbi_tax: Taxonomy object providing ``get_path(taxid)`` and
                ``get_children(taxid)``. When ``None`` a new
                ``NcbiTaxonomy`` is created.
            ctrl_taxa: Control taxid(s) to leave out of the organism bins:
                an int, a list of ints, or ``None`` for no controls.

        Raises:
            ValueError: If ``ctrl_taxa`` has an unsupported type.
        """
        if ncbi_tax is None:
            self.ncbi_tax = NcbiTaxonomy()
        else:
            self.ncbi_tax = ncbi_tax
        self._validate_ctrl_taxa(ctrl_taxa)
        self.composition_dict = self._load_sample_composition(
            sample_composition_path)
        self.total_reads = np.sum(list(self.composition_dict.values()))
        self.organism_counts = self._get_organism_counts()

    def _validate_ctrl_taxa(self, ctrl_taxa):
        """Normalise ``ctrl_taxa`` to a list and store it on ``self``.

        Raises:
            ValueError: If ``ctrl_taxa`` is not ``None``, an int, or a list
                consisting only of ints.
        """
        if ctrl_taxa is None:
            # No controls: an empty list instead of a [None] placeholder,
            # which never matched any integer taxid anyway.
            self.ctrl_taxa = []
        elif isinstance(ctrl_taxa, int):
            self.ctrl_taxa = [ctrl_taxa]
        elif isinstance(ctrl_taxa, list) and all(
                isinstance(taxid, int) for taxid in ctrl_taxa):
            self.ctrl_taxa = ctrl_taxa
        else:
            raise ValueError('ctrl_taxa must be int or list of ints.')

    def _validate_taxid(self, taxid):
        """Return ``taxid`` as an ``int``, converting it when necessary.

        Values that are not already ``int`` (e.g. numeric strings) are
        passed through ``int()``.

        Raises:
            ValueError: If the value cannot be converted to ``int``.
        """
        if isinstance(taxid, int):
            return taxid
        try:
            return int(taxid)
        except (TypeError, ValueError, OverflowError) as err:
            # Only conversion failures are translated; anything else (e.g.
            # KeyboardInterrupt) must propagate untouched.
            raise ValueError(
                f'taxid must be int, received {type(taxid)}') from err

    def _load_sample_composition(self, sample_composition_path):
        """Read the composition file into a ``{taxid: reads}`` dict.

        Each non-blank line must hold at least two tab-separated integer
        fields; blank lines (such as a trailing empty line) are skipped.
        A taxid that occurs more than once keeps its last count.
        """
        composition_dict = {}
        with open(sample_composition_path) as infile:
            for line in infile:
                line = line.strip()
                if not line:
                    continue
                fields = line.split('\t')
                composition_dict[int(fields[0])] = int(fields[1])
        return composition_dict

    def _classify_lineage(self, lineage):
        """Return the organism group name for a set of lineage taxids.

        The checks are ordered; the first match wins: human, fungus,
        parasite, bacteria, virus, otherwise unclassified.
        """
        if self._HUMAN_TAXID in lineage:
            return 'human'
        if self._FUNGUS_TAXID in lineage:
            return 'fungus'
        if not self._PARASITE_TAXIDS.isdisjoint(lineage):
            return 'parasite'
        if self._BACTERIA_TAXID in lineage:
            return 'bacteria'
        if self._VIRUS_TAXID in lineage:
            return 'virus'
        return 'unclassified'

    def _get_organism_counts(self):
        """Sum reads per organism group, skipping control taxa.

        Returns:
            dict with the keys ``human``, ``bacteria``, ``virus``,
            ``parasite``, ``fungus`` and ``unclassified``.
        """
        count_dict = dict.fromkeys(
            ('human', 'bacteria', 'virus', 'parasite', 'fungus',
             'unclassified'), 0)
        # Build the lookup set once instead of scanning the list per taxid.
        excluded = set(self.ctrl_taxa)
        for taxid, count in self.composition_dict.items():
            if taxid in excluded:
                continue
            lineage = set(self.ncbi_tax.get_path(taxid))
            count_dict[self._classify_lineage(lineage)] += count
        return count_dict

    def get_total_reads(self):
        """Return the sum of all read counts in the composition file."""
        return self.total_reads

    def get_taxid_reads(self, taxid):
        """Return the raw read count for ``taxid`` (0 when absent)."""
        taxid = self._validate_taxid(taxid)
        return self.composition_dict.get(taxid, 0)

    def get_taxid_nr(self, taxid, normalizer=1e7):
        """Return reads of ``taxid`` scaled to ``normalizer`` total reads.

        Computed as ``normalizer * reads / total_reads``; 0 when the taxid
        is not present in the sample.
        """
        taxid = self._validate_taxid(taxid)
        if taxid not in self.composition_dict:
            return 0
        return normalizer * self.composition_dict[taxid] / self.total_reads

    def get_genus_nr(self, genus_taxid, normalizer=1e7):
        """Return the summed reads of a genus and all of its children.

        The children come from ``ncbi_tax.get_children(genus_taxid)``; the
        genus taxid itself is included as well.

        NOTE(review): ``normalizer`` is accepted but not applied -- the raw
        read sum is returned, as in the previous implementation where the
        normalisation was disabled. Confirm whether it should be restored.
        """
        genus_taxid = self._validate_taxid(genus_taxid)
        members = set(self.ncbi_tax.get_children(genus_taxid))
        members.add(genus_taxid)
        # Match on the dict *keys* (taxids); matching against the values
        # would compare taxids with read counts.
        return sum(count for taxid, count in self.composition_dict.items()
                   if taxid in members)

    def get_org_comp_abs(self):
        """Return the absolute read counts per organism group."""
        return self.organism_counts

    def get_org_comp_rel(self):
        """Return each organism group as a percentage of ``total_reads``."""
        return {org: 100 * count / self.total_reads
                for org, count in self.organism_counts.items()}

    def get_org_comp_nr(self, normalizer=1e7):
        """Return each organism group scaled to ``normalizer`` total reads."""
        return {org: normalizer * count / self.total_reads
                for org, count in self.organism_counts.items()}
コード例 #5
0
def bin_reads(sample_composition_path,
              ncbi_class=None,
              quantification='relative',
              ctrl_taxids=None):
    """Bin the reads of a sample composition file into organism groups.

    The file is read as tab-separated ``taxid``/``count`` rows. Every taxid
    is resolved to its lineage via ``get_path`` and assigned to the first
    matching group in this order: Human, Fungus, Parasite, Bacteria, Virus,
    otherwise Unclassified.

    Args:
        sample_composition_path: Path of the tab-separated composition file.
        ncbi_class: Taxonomy object providing ``get_path(taxid)``. When
            ``None`` a new ``NcbiTaxonomy`` is created.
        quantification: ``'relative'`` (percent of all binned reads),
            ``'absolute'`` (read sums) or ``'both'`` (a nested dict with
            ``'relative'`` and ``'absolute'`` per group).
        ctrl_taxids: Taxid(s) to leave out entirely: ``None``, a single
            int, or an iterable of values convertible to int.

    Returns:
        dict keyed by ``Human``, ``Bacteria``, ``Virus``, ``Parasite``,
        ``Fungus`` and ``Unclassified``.

    Raises:
        ValueError: If ``quantification`` is not one of the three modes.
    """
    if quantification not in ('relative', 'absolute', 'both'):
        # Fail early with a clear message instead of an UnboundLocalError
        # at the return statement.
        raise ValueError(
            "quantification must be 'relative', 'absolute' or 'both', "
            f'received {quantification!r}')

    if ctrl_taxids is None:
        excluded = set()
    elif isinstance(ctrl_taxids, int):
        excluded = {ctrl_taxids}
    else:
        excluded = {int(taxid) for taxid in ctrl_taxids}

    with open(sample_composition_path, newline='') as file:
        reader = csv.DictReader(file,
                                delimiter='\t',
                                fieldnames=['taxid', 'count'])
        rows = list(reader)

    ncbi = NcbiTaxonomy() if ncbi_class is None else ncbi_class

    human_taxid = 9606
    bacteria_taxid = 2
    virus_taxid = 10239
    fungus_taxid = 4751
    parasite_taxids = {
        7563, 188941, 6029, 5653, 6935, 6178, 5794, 6308, 31277, 119088, 6199,
        85819, 33083, 33084, 75966, 41165, 7509, 6236, 198624, 33634, 5988,
        6249, 5738, 1489900, 740972, 1485168, 37104, 10232
    }

    # numpy integer accumulators keep the result types (and the nan-on-empty
    # behaviour of the relative mode) of the array-based summation.
    sums = {
        group: np.int64(0)
        for group in ('Human', 'Bacteria', 'Virus', 'Parasite', 'Fungus',
                      'Unclassified')
    }

    for row in rows:
        taxid = int(row['taxid'])
        # Compare the parsed taxid, not the whole row dict, with the
        # control taxids.
        if taxid in excluded:
            continue

        count_value = row['count']
        if isinstance(count_value, int):
            count = count_value
        elif isinstance(count_value, str) and count_value:
            # Counts may be written like "12.0"; keep the integer part.
            count = int(count_value.split('.')[0])
        else:
            # Missing (None) or empty count: skip the row rather than reuse
            # the count of the previous row.
            continue

        lineage = set(ncbi.get_path(taxid))
        if human_taxid in lineage:
            group = 'Human'
        elif fungus_taxid in lineage:
            group = 'Fungus'
        elif not parasite_taxids.isdisjoint(lineage):
            group = 'Parasite'
        elif bacteria_taxid in lineage:
            group = 'Bacteria'
        elif virus_taxid in lineage:
            group = 'Virus'
        else:
            group = 'Unclassified'
        sums[group] += count

    if quantification == 'absolute':
        return dict(sums)

    cumulative_sum = sum(sums.values(), np.int64(0))
    relative = {
        group: 100 * total / cumulative_sum
        for group, total in sums.items()
    }
    if quantification == 'relative':
        return relative

    return {
        group: {'relative': relative[group], 'absolute': sums[group]}
        for group in sums
    }