def lastReactions(genotype):
    """Return the KO ids whose in-scope reactions produce a non-currency core metabolite.

    Parameters
    ----------
    genotype : array-like
        Per-position values; positions whose value exceeds 0.5 are mapped to
        KO ids through the module-level `pos_to_gene_map`.

    Returns
    -------
    list
        `uniqify`-ed numeric KO ids from 'KOREF.txt' whose reaction is in
        scope and has a non-zero `prodMat` entry for at least one member of
        `Core` that is not in `Currency`.

    Relies on module-level state: `pos_to_gene_map`, `rxns`,
    `rxn_kegg_to_id`, `rxn_id_to_kegg`, `kegg_to_id`, `rxnMat`, `prodMat`,
    `sumRxnVec`, `seedVec`, `Core`, `Currency`, `giveScope` and `uniqify`.
    """
    kos = [pos_to_gene_map[i] for i in np.where(genotype > 0.5)[0]]

    # 'KOREF.txt' is read as two tab-separated columns (KO id, reaction id).
    # Only `sep` is given: passing both `sep` and `delimiter` raises
    # ValueError in current pandas.
    refDF = pd.read_csv('KOREF.txt', sep='\t', header=None,
                        names=['id', 'rid'])

    def give_number(string):
        # Integer value of the trailing five characters of the identifier.
        return int(string[-5:])

    refDF['id'] = refDF['id'].apply(give_number)
    refDF['rid'] = refDF['rid'].apply(give_number)

    # Rows of the KO -> reaction table that belong to the present KOs.
    kos_to_rxns = refDF.loc[refDF['id'].isin(kos), :]

    # Rebuild zero-padded 'Rxxxxx' labels and keep only those found in `rxns`.
    can = ['R' + str(int(e)).rjust(5, '0') for e in kos_to_rxns['rid'].values]
    can = list(set(can) & set(rxns))
    avrxns = np.array([rxn_kegg_to_id[e] for e in can])

    # Second element of giveScope's result is used as a per-reaction
    # in-scope indicator aligned with `avrxns`.
    avScopeRxn = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                           sumRxnVec[avrxns])[1]

    # np.nonzero yields positions *within* `avrxns`; translate them back to
    # reaction ids before indexing `prodMat`, whose rows are addressed by
    # reaction id (as in `prodMat[avrxns]` above).
    # NOTE(review): multiplying by `avrxns` also drops a reaction whose id
    # is 0 -- confirm that is intended.
    scopeRxns = avrxns[np.nonzero(avrxns * avScopeRxn)[0]]

    cores = [kegg_to_id[c] for c in Core if c not in Currency]
    core_sum = np.sum(prodMat[scopeRxns][:, cores], axis=1)

    # `core_sum` is aligned with `scopeRxns`, so map its hits through it.
    lasts = [rxn_id_to_kegg[e] for e in scopeRxns[np.where(core_sum)[0]]]
    mlasts = [int(l[1:]) for l in lasts]

    return uniqify(
        kos_to_rxns.loc[kos_to_rxns['rid'].isin(mlasts)]['id'].tolist())
def intReactions(genotype, firsts, lasts):
    """Return KO ids of in-scope reactions that are neither in `firsts` nor `lasts`.

    Parameters
    ----------
    genotype : array-like
        Per-position values; positions whose value exceeds 0.5 are mapped to
        KO ids through the module-level `pos_to_gene_map`.
    firsts, lasts : iterable
        KO ids to exclude from the result. Elements must be hashable.

    Returns
    -------
    list
        One entry per 'KOREF.txt' row whose KO id is linked to an in-scope
        reaction and is not excluded; a KO listed on several rows therefore
        appears several times.

    Relies on module-level state: `pos_to_gene_map`, `rxns`,
    `rxn_kegg_to_id`, `rxn_id_to_kegg`, `rxnMat`, `prodMat`, `sumRxnVec`,
    `seedVec`, `giveScope` and `uniqify`.
    """
    kos = [pos_to_gene_map[i] for i in np.where(genotype > 0.5)[0]]

    # 'KOREF.txt' is read as two tab-separated columns (KO id, reaction id).
    # Only `sep` is given: passing both `sep` and `delimiter` raises
    # ValueError in current pandas.
    refDF = pd.read_csv('KOREF.txt', sep='\t', header=None,
                        names=['id', 'rid'])

    def give_number(string):
        # Integer value of the trailing five characters of the identifier.
        return int(string[-5:])

    refDF['id'] = refDF['id'].apply(give_number)
    refDF['rid'] = refDF['rid'].apply(give_number)

    # Rows of the KO -> reaction table that belong to the present KOs.
    kos_to_rxns = refDF.loc[refDF['id'].isin(kos), :]

    # Rebuild zero-padded 'Rxxxxx' labels and keep only those found in `rxns`.
    can = ['R' + str(int(e)).rjust(5, '0') for e in kos_to_rxns['rid'].values]
    can = list(set(can) & set(rxns))
    avrxns = np.array([rxn_kegg_to_id[e] for e in can])

    # Second element of giveScope's result is used as a per-reaction
    # in-scope indicator aligned with `avrxns`.
    avScopeRxn = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                           sumRxnVec[avrxns])[1]
    scopeRxns = np.nonzero(avrxns * avScopeRxn)[0]

    # Numeric reaction ids of everything in scope.
    ints = [rxn_id_to_kegg[e] for e in avrxns[scopeRxns]]
    mints = [int(l[1:]) for l in ints]

    nkos = uniqify(
        kos_to_rxns.loc[kos_to_rxns['rid'].isin(mints)]['id'].tolist())
    kos_to_rxns = refDF.loc[refDF['id'].isin(nkos), :]

    # Build the exclusion set once instead of re-concatenating both lists
    # for every candidate.
    excluded = set(firsts).union(lasts)
    return [e for e in kos_to_rxns['id'].tolist() if e not in excluded]
def genIndex(host, data):
    """Build the `uniqify`-ed list of "ip" values stored under `host` in each alert.

    Parameters
    ----------
    host : hashable
        Key selecting the sub-record of every alert to read the "ip" from.
    data : mapping
        Must provide an "alert" sequence whose items are indexable by `host`.

    Returns
    -------
    Whatever `uniqify` returns for the collected "ip" values.
    """
    addresses = [alert[host]["ip"] for alert in data["alert"]]
    return uniqify(addresses)
def genIndex():
    """Build the `uniqify`-ed list of "ip" values from the module-level alerts.

    Reads the module-level `data` (its "alert" sequence) and `sys_type`
    (the key selecting which sub-record of each alert holds the "ip").

    Returns
    -------
    Whatever `uniqify` returns for the collected "ip" values.
    """
    addresses = [alert[sys_type]["ip"] for alert in data["alert"]]
    return uniqify(addresses)
def genRandOrg(pathDict):
    """Assemble a random organism: one randomly chosen path per core molecule.

    Parameters
    ----------
    pathDict : mapping
        Maps every member of the module-level `Core` to a sequence of
        candidate paths (each path being a collection of reaction ids).

    Returns
    -------
    numpy.ndarray
        Integer array of the `uniqify`-ed reactions drawn from the chosen
        paths.
    """
    # One draw per core molecule, in the iteration order of `Core`.
    chosen_paths = [random.choice(pathDict[target]) for target in Core]

    # Flatten every chosen path and pool them behind an empty seed array.
    pooled = np.concatenate(
        [np.array([])] + [np.ravel(path) for path in chosen_paths])

    return np.array(uniqify(pooled)).astype(int)
def propagate_rxns_for_medium(orgrxns, medium):
    """Return the KEGG ids of all metabolites in scope for a reaction set on a medium.

    Parameters
    ----------
    orgrxns : sequence
        Reaction indices available to the organism; used to index `rxnMat`,
        `prodMat` and `sumRxnVec`.
    medium : list
        KEGG ids seeded in addition to `Currency` (concatenated with it).

    Returns
    -------
    `uniqify`-ed KEGG ids of the non-zero entries of the first element
    returned by `giveScope`.
    """
    # Seed vector: 1 for every currency and medium metabolite, 0 elsewhere.
    seed_ids = [kegg_to_id[met] for met in Currency + medium]
    seedVec = np.zeros(len(rxnMat.T))
    seedVec[seed_ids] = 1

    # Work on a copy/slice of the caller's reaction collection.
    avrxns = orgrxns[:]

    # First element of giveScope's result flags the metabolites in scope.
    scope_result = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                             sumRxnVec[avrxns])
    scopeMets = scope_result[0]

    reachable = np.where(scopeMets)[0]
    return uniqify([id_to_kegg[met_id] for met_id in reachable])
def sortedGenOrg(pathDict, sortFunc=fitCost, optFunc=max):
    """Assemble an organism from the best-scoring path of every core molecule.

    Parameters
    ----------
    pathDict : mapping
        Maps every member of the module-level `Core` to a sequence of
        candidate paths (each path being a collection of reaction ids).
    sortFunc : callable
        Scores a single path.
    optFunc : callable
        Picks the winning score from the list of scores (`max` by default,
        so the highest-scoring path wins unless another function is given).

    Returns
    -------
    numpy.ndarray
        Integer array of the `uniqify`-ed reactions of the selected paths.
    """
    selected = []
    for target in Core:
        candidates = pathDict[target]
        scores = [sortFunc(path) for path in candidates]
        # First candidate whose score equals the optimum.
        best_at = scores.index(optFunc(scores))
        selected.append(candidates[best_at])

    # Flatten every selected path and pool them behind an empty seed array.
    pooled = np.concatenate(
        [np.array([])] + [np.ravel(path) for path in selected])

    return np.array(uniqify(pooled)).astype(int)
def propagate_single_for_medium(org, medium):
    """Return the KEGG ids of all metabolites in scope for one strain on a medium.

    Parameters
    ----------
    org : str
        Strain name; its reactions are read from
        'strain_reactions/<org>.txt' as whitespace-separated labels.
    medium : list
        KEGG ids seeded in addition to `Currency` (concatenated with it).

    Returns
    -------
    `uniqify`-ed KEGG ids of the non-zero entries of the first element
    returned by `giveScope`.
    """
    # Seed vector: 1 for every currency and medium metabolite, 0 elsewhere.
    seedVec = np.zeros(len(rxnMat.T))
    seedVec[[kegg_to_id[e] for e in Currency + medium]] = 1

    # Read the strain's reaction labels; the context manager guarantees the
    # file handle is closed (the handle was previously left open).
    with open('strain_reactions/' + org + '.txt', 'r') as rxn_file:
        can = rxn_file.read().split()

    # Keep only the labels present in `rxns` and translate them to indices.
    can = list(set(can) & set(rxns))
    avrxns = [rxn_kegg_to_id[e] for e in can]

    # First element of giveScope's result flags the metabolites in scope.
    scopeMets = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                          sumRxnVec[avrxns])[0]

    return uniqify([id_to_kegg[e] for e in np.where(scopeMets)[0]])
def propagate_core_for_medium(corerxns, medium):
    """Return the KEGG ids of all metabolites in scope for numeric reaction ids on a medium.

    Parameters
    ----------
    corerxns : iterable
        Numeric reaction ids; each is turned into a zero-padded 'Rxxxxx'
        label before being looked up.
    medium : list
        KEGG ids seeded in addition to `Currency` (concatenated with it).

    Returns
    -------
    `uniqify`-ed KEGG ids of the non-zero entries of the first element
    returned by `giveScope`.
    """
    # Seed vector: 1 for every currency and medium metabolite, 0 elsewhere.
    seed_ids = [kegg_to_id[met] for met in Currency + medium]
    seedVec = np.zeros(len(rxnMat.T))
    seedVec[seed_ids] = 1

    # Left-pad the integer form of every id with zeros to five characters.
    labels = ['R' + str(int(rid)).rjust(5, '0') for rid in corerxns]

    # Keep only the labels present in `rxns` and translate them to indices.
    usable = list(set(labels) & set(rxns))
    avrxns = [rxn_kegg_to_id[label] for label in usable]

    # First element of giveScope's result flags the metabolites in scope.
    scope_result = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                             sumRxnVec[avrxns])
    scopeMets = scope_result[0]

    reachable = np.where(scopeMets)[0]
    return uniqify([id_to_kegg[met_id] for met_id in reachable])
def core_rxns(strains, CUTOFF=0.95):
    """Return the reaction indices present in at least `CUTOFF` of the strains.

    Parameters
    ----------
    strains : sequence of str
        Strain names; each strain's reactions are read from
        'strain_reactions/<strain>.txt' as whitespace-separated labels.
    CUTOFF : float
        Minimum fraction of strains that must carry a reaction for it to be
        reported.

    Returns
    -------
    list
        Reaction indices (values of `rxn_kegg_to_id`) meeting the cutoff, in
        the order produced by `uniqify(unlistify(...))` over all strains.
    """
    # Loop-invariant: build the lookup set of known reactions only once.
    known_rxns = set(rxns)

    all_sets = []
    for org in strains:
        # The context manager guarantees the file handle is closed (the
        # handle was previously left open for every strain).
        with open('strain_reactions/' + org + '.txt', 'r') as rxn_file:
            can = set(rxn_file.read().split())
        all_sets.append({rxn_kegg_to_id[e] for e in can & known_rxns})

    pangenes = uniqify(unlistify(all_sets))

    n_strains = float(len(strains))
    core_genes = []
    for gene in pangenes:
        # Number of strains whose reaction set contains this entry.
        n_carriers = sum(1 for rxn_set in all_sets if gene in rxn_set)
        if n_carriers / n_strains >= CUTOFF:
            core_genes.append(gene)
    return core_genes
import requests import pandas as pd from bs4 import BeautifulSoup from tqdm import tqdm import urllib.request import numpy as np from uniqify import uniqify from unlistify import unlistify import os import json from load_kegg import * # Getting the set of bacterial abbreviations for each organism in KEGG. pangenome_df = pd.read_csv('fuller_pangenome_df.csv') old_pangenome_df = pd.read_csv('pangenome_df.csv') species = uniqify(list(pangenome_df['species'].values)) old_species = uniqify(list(old_pangenome_df['species'].values)) all_strains = uniqify(list(pangenome_df['strain'].values)) strain_abbrs = uniqify(list(pangenome_df['kegg_abbr'].values)) index_array = np.array(list(range(len(all_strains)))) # Getting all SEED organisms. seed_orgs = pd.read_excel('seed_orgs.xlsx') list_seed_names = list(seed_orgs['seed_name'].values) seed_onames = [s[s.find("(")+1:s.find(")")] for s in list_seed_names] smap = dict(zip(seed_onames, list_seed_names)) exact_in_seed_dict = {} exact_species = {} for s in list(set(seed_onames) & set(all_strains)): exact_in_seed_dict[smap[s]] = pangenome_df.loc[
from load_data import *
from uniqify import uniqify
from unlistify import unlistify

# For every core molecule, a 0/1 vector marking where its column of
# `prodMat` equals 1.
# NOTE(review): `prodMat`, `Core`, `pathDict`, `tqdm`, `random` and `np` are
# presumably supplied by the star import from `load_data` -- confirm.
coreProdRxns = {coreTBP: (prodMat[:, coreTBP] == 1) * 1 for coreTBP in Core}

# For each random organism: the number of core molecules whose chosen path
# contains one randomly picked reaction of that organism.
frglList = []
NUM_RAND_ORGS = 1000
for thisIter in tqdm(range(NUM_RAND_ORGS)):
    # Draw one path per core molecule, in the iteration order of `Core`.
    orgPathDict = {}
    for coreTBP in Core:
        chosenPath = random.choice(pathDict[coreTBP])
        orgPathDict[coreTBP] = list(chosenPath.astype(int))

    # Unique reactions of the organism across all of its chosen paths.
    allPathRxns = unlistify(orgPathDict.values())
    orgRxns = np.array(uniqify(allPathRxns)).astype(int)

    # Pick a single reaction of the organism at random.
    remRxn = np.random.choice(orgRxns)

    # Core molecules whose chosen path includes that reaction.
    thisRxnList = [
        coreTBP for coreTBP in Core if remRxn in orgPathDict[coreTBP]
    ]
    frglList.append(len(thisRxnList))

import matplotlib.pyplot as plt

# Histogram of the counts, weighted so that the bar heights sum to one.
fig, ax = plt.subplots(1)
myWeights = np.ones_like(frglList) / len(frglList)
ax.hist(frglList, bins=7, color='gray', weights=myWeights)
if quitFlag: break while True: if tTried > 1e5: quitFlag = True break tFlag = False tTried += 1 # Generate a random organism. while True: orgPathDict = {} for coreTBP in Core: orgPathDict[ coreTBP ] = list( random.choice( pathDict[ coreTBP ] ).astype(int) ) # Storing the list of reactions. orgRxns = np.array( uniqify( unlistify( orgPathDict.values() ) ) ).astype( int ) # Creating a dictionary of secretions. orgSecDict = {} for coreTBP in Core: orgSecDict[ coreTBP ] = list( np.nonzero( pathwaySecByproducts( orgPathDict[ coreTBP ], orgRxns, rxnMat, prodMat, Core ) )[0] ) # Purging duplicates to have only unique byproducts. tempSet = set() fullSet = unlistify( orgSecDict.values() ) duplicates = set(x for x in fullSet if x in tempSet or tempSet.add(x)) orgSecDict = { coreTBP: list( set( orgSecDict[ coreTBP ] ).difference( duplicates ) ) for coreTBP in Core }
tO = anc_recon_table.loc[anc_recon_table['Node'] == node.name[1:-1]] else: tO = anc_recon_table.loc[anc_recon_table['Node'] == node.name] return tO['Prob'].values # Now traversing the tree and inferring ancestral states for all unmarked nodes. for thisNode in nodes: try: thisNode.genotype except: thisNode.add_feature('genotype', reconAncestor(anc_recon_table, thisNode)) gene_ids = sorted(uniqify(unlistify(list(geneDict.values())))) gene_ids = list(np.array(gene_ids)[good_indices]) # Using first ancestral genotype inference method to calculate gains and losses. def giveGainsAndLosses(parent, child): gainGenes, lostGenes = set(), set() for indx, geneID in enumerate(gene_ids): parentProb, childProb = parent.genotype[indx], child.genotype[indx] # Order is present, absent, gain and loss. prsnProb = parentProb * childProb absnProb = (1 - parentProb) * (1 - childProb) gainProb = (1 - parentProb) * childProb lossProb = parentProb * (1 - childProb)
def _get_unique_slices_list(day):
    """Return the weather time slices of `day`, passed through `uniqify` keyed on `id`.

    Parameters
    ----------
    day
        Value matched against the `day_of_occurance` field of
        `WeatherTimeSlice`.
    """
    def _slice_key(weather_slice):
        # Slices are told apart by their `id` attribute.
        return weather_slice.id

    slices_for_day = WeatherTimeSlice.objects.filter(day_of_occurance=day)
    return uniqify(slices_for_day, _slice_key)
def _get_unique_weather_list(site):
    """Return the weather days of a site's stream, passed through `uniqify`.

    The `WeatherWatchQueue` whose `relevant_site` is `site` is looked up,
    its `DayOfWeather` records are ordered by "date_it_happens", and the
    result is handed to `uniqify` keyed on `as_machine_timestring()`.

    Parameters
    ----------
    site
        Value matched against the `relevant_site` field of
        `WeatherWatchQueue`.
    """
    def _day_key(day_of_weather):
        # Days are told apart by their machine time string.
        return day_of_weather.as_machine_timestring()

    watch_queue = WeatherWatchQueue.objects.get(relevant_site=site)
    days_in_stream = DayOfWeather.objects.filter(weather_stream=watch_queue)
    ordered_days = days_in_stream.order_by("date_it_happens")
    return uniqify(ordered_days, _day_key)
# urllib.request.urlretrieve( thisURL, saveDir + thisOrg + '.txt' ) # Tracking the biomass reaction name in each organism's reaction set. saveDir = 'bigg_orgs/' biomass_url_holder = 'http://bigg.ucsd.edu/api/v2/models/' org_bm_list = [] for thisOrg in tqdm(org_biggids): with open(saveDir + thisOrg + '.txt', 'r') as f: s = f.readline() thisBMname = re.findall(r'\"bigg_id\": \"(BIOMASS.*?)\"', s)[0] org_bm_list.append(thisBMname) # thisURL = biomass_url_holder + thisOrg + '/reactions/' + thisBMname # urllib.request.urlretrieve(thisURL, saveDir + thisBMname + '.txt') # Getting the unique list. org_bm_list = uniqify(org_bm_list) #------------------------------------------------------------------------- # Now mapping the metabolite names to possible KEGG IDs. #------------------------------------------------------------------------- bmKEGGS = [] for thisBMname in tqdm(org_bm_list): thisbm = open(saveDir + thisBMname + '.txt', 'r').readline() biggids = re.findall(r'\"bigg_id\": \"(.*?)\"', thisbm)[:-2] bmMetsDF = pd.DataFrame({'universal_bigg_id': biggids}) filt_bigg_df = bigg_df.merge(bmMetsDF, on='universal_bigg_id') relLinks = list(filt_bigg_df['database_links'].values) # Now identifying all the KEGG IDs. for tl in relLinks: try:
baddeds_all = [] cwastes_all = [] while len(fitterDB) < 500: print_progress_bar(len(fitterDB), 500, 'Building mutualisms database') while True: tFlag = False tTried += 1 # Generate a random organism. while True: orgPathDict = {} for coreTBP in Core: orgPathDict[coreTBP] = list( random.choice(pathDict[coreTBP]).astype(int)) # Storing the list of reactions. orgRxns = np.array(uniqify(unlistify( orgPathDict.values()))).astype(int) # Creating a dictionary of secretions. orgSecDict = {} for coreTBP in Core: orgSecDict[coreTBP] = list( np.nonzero( pathwaySecByproducts(orgPathDict[coreTBP], orgRxns, rxnMat, prodMat, Core))[0]) # Purging duplicates to have only unique byproducts. tempSet = set() fullSet = unlistify(orgSecDict.values()) duplicates = set(x for x in fullSet if x in tempSet or tempSet.add(x)) orgSecDict = {
'r').readlines()).split() can = list((set(can)) & set(rxns)) avrxns = [rxn_kegg_to_id[e] for e in can] # Calculating the metabolites within the scope of # this organism's reaction network. scopeMets = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec, sumRxnVec[avrxns])[0] # Finding how much of the core is within the network's scope. return uniqify([id_to_kegg[e] for e in np.where(scopeMets)[0]]) # Getting the set of bacterial abbreviations for each organism in KEGG. pangenome_df = pd.read_csv('pangenome_df.csv') species = uniqify(list(pangenome_df['species'].values)) all_strains = uniqify(list(pangenome_df['kegg_abbr'].values)) index_array = np.array(list(range(len(all_strains)))) # Generating a vector of all metabolites initially provided, i.e. seeds. seeds_df = pd.read_csv('../black_queen_critique/seeds_from_vitkup.csv') media = list(seeds_df['kegg_id'].values) media_sets = list(itertools.combinations(media, 1)) # Generating the null distribution. # Getting random pairs. NUM_SAMPLES = 1000 sample_indices = collect_samples(index_array, 2, NUM_SAMPLES)[:NUM_SAMPLES] samples = [[all_strains[j] for j in sample_indices[i]] for i in range(len(sample_indices))]