Class_TaxId integer, OOrder text, Order_TaxId integer, Family text, Family_TaxId integer, Genus text, Genus_TaxId integer, Species text, Species_TaxId integer);""" cursor.execute(query) connection.commit() ############# # Read and store taxids in list of classes store_lineage_info = [] with open(in_res_file) as res: next(res) # skip first line for line in csv.reader(res, delimiter='\t'): match_info = cTaxInfo.TaxInfo() match_info.Lineage = line[5] match_info.Sample = sample_name match_info.RefDatabase = ref_database # Relative abundance rel_abund = line[0].replace("%", "") match_info.Abundance = float(rel_abund) # get Lineage info from NCBI match_info.TaxId = int(line[4]) if match_info.TaxId != 0: try:
def _tax_info_for_match(tokens, ref_database):
    """Build the TaxInfo object for one template (match) name.

    Parameters
    ----------
    tokens : list of str
        The template name split on "|" and " ", separators included (so the
        name fields sit at the even positions).
    ref_database : str
        One of "UNITE", "RefSeq" or "nt"; selects which token positions hold
        the taxid and the lineage/species name.

    Returns
    -------
    The TaxInfo object, completed by ``fNCBItax.lineage_extractor`` whenever
    a taxid is available.

    Raises
    ------
    ValueError
        If ``ref_database`` is not one of the supported databases.
    """
    match_info = cTaxInfo.TaxInfo()

    if ref_database == "UNITE":
        match_info.Lineage = tokens[12]
        # if the taxid is known:
        if tokens[4] != 'unk_taxid':
            match_info.TaxId = int(tokens[4])
            match_info = fNCBItax.lineage_extractor(match_info.TaxId, match_info)
        # Warning about unknown taxids:
        else:
            print("")
            print("WARNING: based on accession number, no taxonomic information was found in NCBI for %s" % (match_info.Lineage))
            print("This match will not get NCBItax taxonomic ranks")
            print("")
            match_info.TaxId = tokens[4]  # 'unk_taxid'

    elif ref_database == "RefSeq":
        match_info.TaxId = int(tokens[4])
        match_info.Lineage = tokens[6] + " " + tokens[8]
        # include info from NCBI:
        match_info = fNCBItax.lineage_extractor(match_info.TaxId, match_info)

    elif ref_database == "nt":
        match_info.Lineage = tokens[4] + " " + tokens[6]
        # the taxid is the first field of the template name
        taxid = tokens[0]
        if taxid == 'unk_taxid':
            # Warning about unknown taxids:
            print("")
            print("WARNING: no NCBI's taxid found for accession %s" % (match_info.Lineage))
            print("This match will not get taxonomic ranks")
            print("")
        else:
            match_info.TaxId = int(taxid)
            match_info = fNCBItax.lineage_extractor(match_info.TaxId, match_info)

    else:
        # Without this, an unsupported database only surfaced later as a
        # NameError on the query identity.
        raise ValueError("Unsupported reference database: %r" % (ref_database,))

    return match_info


def populate_w_tax(in_df, ref_database, species_threshold, genus_threshold,
                   family_threshold, order_threshold, class_threshold,
                   phylum_threshold):
    """Add taxonomic ranks and an LCA taxid to every match in ``in_df``.

    The index of ``in_df`` holds the template (match) names and the column
    ``Query_Identity`` holds the similarity of each match. A rank is only
    filled in when the similarity reaches the threshold of that rank, and
    ``LCA_TaxId`` ends up as the taxid of the deepest rank that was both
    accepted and has a taxid.

    Threshold values suggested by the original authors:
        species_threshold = 98.41  # Yeast - Vu et al 2016
        genus_threshold   = 96.31  # Yeast - Vu et al 2016
        family_threshold  = 88.51  # Filamentous fungi - Vu et al 2019
        order_threshold   = 81.21  # Filamentous fungi - Vu et al 2019
        class_threshold   = 80.91  # Filamentous fungi - Vu et al 2019
        phylum, kingdom and superkingdom = 0  # no data, no filtering

    Parameters
    ----------
    in_df : pandas.DataFrame
        Matches indexed by template name, with a ``Query_Identity`` column.
    ref_database : str
        "UNITE", "RefSeq" or "nt"; decides how the template name is parsed.
    species_threshold, genus_threshold, family_threshold, order_threshold,
    class_threshold, phylum_threshold : float
        Minimum ``Query_Identity`` required to report the given rank.

    Returns
    -------
    pandas.DataFrame
        A new frame (``DataFrame.assign`` result) with the columns LCA_TaxId,
        Superkingdom, Kingdom, Phylum, Class, Order, Family, Genus, Species.

    Raises
    ------
    ValueError
        If a row is processed with an unsupported ``ref_database``.
    """
    # Make sure all taxa columns are strings (doesn't automatically happen
    # if the first one is None)
    in_df = in_df.assign(LCA_TaxId="", Superkingdom="", Kingdom="", Phylum="",
                         Class="", Order="", Family="", Genus="", Species="")

    # The capture group keeps the separators in the result, which is why the
    # name fields are read from even positions only.
    splitter = re.compile(r'(\|| )')

    # Ranks from the shallowest to the deepest, so that a deeper accepted
    # rank overwrites the LCA taxid written by a shallower one.
    # NOTE(review): each rank is tested against its own threshold
    # independently; this assumes the thresholds do not decrease from phylum
    # to species - confirm with callers.
    rank_thresholds = (
        ('Phylum', phylum_threshold),
        ('Class', class_threshold),
        ('Order', order_threshold),
        ('Family', family_threshold),
        ('Genus', genus_threshold),
        ('Species', species_threshold),
    )

    # index == the #template (fungal match)
    for index, row in in_df.iterrows():
        tokens = splitter.split(index)
        qiden = row['Query_Identity']
        match_info = _tax_info_for_match(tokens, ref_database)

        # Populate the df with lineage info and the LCA taxid:
        in_df.at[index, 'Superkingdom'] = match_info.Superkingdom

        # Assign LCA_taxid. Go to Kingdom if possible; otherwise keep the
        # superkingdom taxid instead of overwriting it with a missing value.
        in_df.at[index, 'LCA_TaxId'] = match_info.Superkingdom_TaxId
        if match_info.Kingdom_TaxId is not None:
            in_df.at[index, 'LCA_TaxId'] = match_info.Kingdom_TaxId
        # if it matches to uncultured or unclassified fungus, use the Fungi
        # LCA taxid:
        if match_info.Kingdom == 'Fungi':
            in_df.at[index, 'LCA_TaxId'] = 4751
        in_df.at[index, 'Kingdom'] = match_info.Kingdom

        # fill in the rest of the table according to similarity threshold:
        for rank, threshold in rank_thresholds:
            if qiden >= threshold:
                in_df.at[index, rank] = getattr(match_info, rank)
                rank_taxid = getattr(match_info, rank + '_TaxId')
                if rank_taxid is not None:
                    in_df.at[index, 'LCA_TaxId'] = rank_taxid

    return in_df
import cTaxInfo  # script that defines the classes used here
import fNCBItax  # script with the function to get the lineage from a taxid
import sqlite3

in_csv_fp = "samples_info.csv"
fungi_csv = pd.read_csv(in_csv_fp)
sql_fp = "benchm.db"

# store info in a list of objects
store_fungi_info = []

###### convert existing rows to the cTaxInfo class and include NCBI lineage info
for key, row in fungi_csv.iterrows():
    fungus = cTaxInfo.TaxInfo()
    fungus.Sample = row['Sample']
    fungus.Lineage = row['Lineage']
    fungus.TaxId = row['taxid']
    fungus.Coverage = row['Coverage']
    # The value used to be stored only under the misspelled name "Covergae".
    # The alias is kept in case later code still reads it.
    # NOTE(review): drop the alias once no reader of "Covergae" remains.
    fungus.Covergae = fungus.Coverage
    # include tax info from NCBI (the returned object replaces `fungus`):
    fungus = fNCBItax.lineage_extractor(fungus.TaxId, fungus)
    store_fungi_info.append(fungus)

###### Add to SQL
connection = sqlite3.connect(sql_fp)
cursor = connection.cursor()