def lastReactions(genotype):
    """Return the KO gene ids mapped to reactions that produce core molecules.

    Steps, as performed below:
      1. Genes whose entry in ``genotype`` exceeds 0.5 are looked up in
         ``pos_to_gene_map``.
      2. ``KOREF.txt`` (two tab-separated columns, no header) is read as a
         gene-id -> reaction-id table; only the trailing five characters of
         each field are kept, parsed as integers.
      3. The reactions of the selected genes that are also in ``rxns`` are
         passed to ``giveScope``; the second element of its return value is
         used as a per-reaction mask.
      4. Reactions with a non-zero summed ``prodMat`` entry over the
         non-currency core metabolites are mapped back to gene ids.

    Args:
        genotype: array supporting ``genotype > 0.5``; presumably a numpy
            vector of per-gene presence values -- TODO confirm with callers.

    Returns:
        Whatever ``uniqify`` returns for the list of matching gene ids.
    """
    # Genes regarded as present in this genotype.
    kos = [pos_to_gene_map[i] for i in np.where((genotype > 0.5) * 1)[0]]

    # The original call passed both delimiter='\t' and sep='delimiter'.
    # Recent pandas versions reject that combination with a ValueError, and
    # older ones silently let `delimiter` win, so a single tab separator is
    # the behaviour that was actually in effect.
    refDF = pd.read_csv('KOREF.txt',
                        sep='\t',
                        header=None,
                        names=['id', 'rid'])

    def give_number(string):
        # Keep only the trailing five characters and parse them as an int.
        return int(string[-5:])

    refDF['id'] = refDF['id'].apply(give_number)
    refDF['rid'] = refDF['rid'].apply(give_number)

    # Rows of the reference table that belong to the selected genes.
    kos_to_rxns = refDF.loc[refDF['id'].isin(kos), :]

    # Rebuild KEGG-style reaction names ('R' + id left-padded with zeros to
    # five characters) and keep only those known to the loaded network.
    can = ['R' + str(int(e)).rjust(5, '0') for e in kos_to_rxns['rid'].values]
    can = list(set(can) & set(rxns))
    avrxns = np.array([rxn_kegg_to_id[e] for e in can])

    avScopeRxn = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                           sumRxnVec[avrxns])[1]

    # NOTE(review): `scopeRxns` holds positions within `avrxns`, yet it is
    # used below to index rows of the full `prodMat`, and positions within
    # `core_sum` are then used to index `avrxns`. intReactions uses
    # `avrxns[scopeRxns]` instead. Left unchanged here -- please confirm
    # which indexing is intended.
    scopeRxns = np.nonzero(avrxns * avScopeRxn)[0]
    cores = [kegg_to_id[c] for c in Core if c not in Currency]
    core_sum = np.sum(prodMat[scopeRxns][:, cores], axis=1)
    lasts = [rxn_id_to_kegg[e] for e in avrxns[np.where(core_sum)[0]]]

    # Strip the leading 'R' to get back to the integer ids used in refDF.
    mlasts = [int(name[1:]) for name in lasts]
    return uniqify(
        kos_to_rxns.loc[kos_to_rxns['rid'].isin(mlasts)]['id'].tolist())
def intReactions(genotype, firsts, lasts):
    """Return gene ids of in-scope reactions that are neither first nor last.

    The selected genes (entries of ``genotype`` above 0.5) are mapped to
    reactions through ``KOREF.txt``; reactions flagged by the second element
    of ``giveScope``'s return value are mapped back to gene ids, and every
    id present in ``firsts`` or ``lasts`` is dropped.

    Args:
        genotype: array supporting ``genotype > 0.5``; presumably a numpy
            vector of per-gene presence values -- TODO confirm with callers.
        firsts: iterable of gene ids to exclude.
        lasts: iterable of gene ids to exclude.

    Returns:
        A list of gene ids, one entry per matching row of the reference
        table (so an id may appear more than once).
    """
    # Genes regarded as present in this genotype.
    kos = [pos_to_gene_map[i] for i in np.where((genotype > 0.5) * 1)[0]]

    # The original call passed both delimiter='\t' and sep='delimiter'.
    # Recent pandas versions reject that combination with a ValueError, and
    # older ones silently let `delimiter` win, so a single tab separator is
    # the behaviour that was actually in effect.
    refDF = pd.read_csv('KOREF.txt',
                        sep='\t',
                        header=None,
                        names=['id', 'rid'])

    def give_number(string):
        # Keep only the trailing five characters and parse them as an int.
        return int(string[-5:])

    refDF['id'] = refDF['id'].apply(give_number)
    refDF['rid'] = refDF['rid'].apply(give_number)

    # Rows of the reference table that belong to the selected genes.
    kos_to_rxns = refDF.loc[refDF['id'].isin(kos), :]

    # Rebuild KEGG-style reaction names ('R' + id left-padded with zeros to
    # five characters) and keep only those known to the loaded network.
    can = ['R' + str(int(e)).rjust(5, '0') for e in kos_to_rxns['rid'].values]
    can = list(set(can) & set(rxns))
    avrxns = np.array([rxn_kegg_to_id[e] for e in can])

    avScopeRxn = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                           sumRxnVec[avrxns])[1]

    # Positions (within avrxns) of the reactions flagged by giveScope.
    scopeRxns = np.nonzero(avrxns * avScopeRxn)[0]
    ints = [rxn_id_to_kegg[e] for e in avrxns[scopeRxns]]

    # Strip the leading 'R' to get back to the integer ids used in refDF.
    mints = [int(name[1:]) for name in ints]
    nkos = uniqify(
        kos_to_rxns.loc[kos_to_rxns['rid'].isin(mints)]['id'].tolist())
    kos_to_rxns = refDF.loc[refDF['id'].isin(nkos), :]

    # Build the exclusion set once instead of concatenating two lists for
    # every candidate id (assumes the ids are hashable, e.g. integers).
    excluded = set(firsts) | set(lasts)
    return [e for e in kos_to_rxns['id'].tolist() if e not in excluded]
Beispiel #3
0
def genIndex(host, data):
    """Collect the distinct IP values found under ``host`` in every alert.

    Args:
        host: key looked up in each entry of ``data["alert"]``.
        data: mapping with an ``"alert"`` sequence; each entry must provide
            ``entry[host]["ip"]``.

    Returns:
        The result of ``uniqify`` applied to the collected IP values.
    """
    addresses = [alert[host]["ip"] for alert in data["alert"]]
    return uniqify(addresses)
def genIndex():
    """Collect the distinct IP values found under ``sys_type`` in every alert.

    Reads the module-level names ``data`` and ``sys_type``; each entry of
    ``data["alert"]`` must provide ``entry[sys_type]["ip"]``.

    Returns:
        The result of ``uniqify`` applied to the collected IP values.
    """
    addresses = [alert[sys_type]["ip"] for alert in data["alert"]]
    return uniqify(addresses)
Beispiel #5
0
def genRandOrg(pathDict):
    """Assemble a random organism from one pathway per core molecule.

    Args:
        pathDict: mapping from each element of ``Core`` to a non-empty
            sequence of candidate pathways (each a sequence of reaction ids).

    Returns:
        Integer numpy array built from ``uniqify`` applied to all reaction
        ids of the drawn pathways.
    """
    # One randomly drawn pathway for every core molecule, in Core order.
    drawn = [random.choice(pathDict[target]) for target in Core]

    # Flatten everything behind an empty float array, which is what repeated
    # np.append calls starting from np.array([]) would produce.
    flattened = np.concatenate([np.array([])] + [np.ravel(p) for p in drawn])

    return np.array(uniqify(flattened)).astype(int)
Beispiel #6
0
def propagate_rxns_for_medium(orgrxns, medium):
    """List the metabolites reachable by ``orgrxns`` when fed ``medium``.

    Args:
        orgrxns: sliceable collection of reaction indices into ``rxnMat``,
            ``prodMat`` and ``sumRxnVec``.
        medium: list of KEGG metabolite ids; it is concatenated with
            ``Currency`` to form the seed set.

    Returns:
        ``uniqify`` applied to the KEGG ids of every metabolite that is
        non-zero in the first element returned by ``giveScope``.
    """
    # Seed vector: one slot per column of rxnMat, set to 1 for the currency
    # and medium metabolites.
    seed_ids = [kegg_to_id[met] for met in Currency + medium]
    seeds = np.zeros(len(rxnMat.T))
    seeds[seed_ids] = 1

    # Work on a copy of the organism's reaction indices.
    reactions = orgrxns[:]

    # First element of giveScope's result is the metabolite scope vector.
    scope = giveScope(rxnMat[reactions], prodMat[reactions], seeds,
                      sumRxnVec[reactions])[0]

    reachable = [id_to_kegg[idx] for idx in np.nonzero(scope)[0]]
    return uniqify(reachable)
Beispiel #7
0
def sortedGenOrg(pathDict, sortFunc=fitCost, optFunc=max):
    """Assemble an organism from the best-scoring pathway per core molecule.

    Args:
        pathDict: mapping from each element of ``Core`` to a sequence of
            candidate pathways (each a sequence of reaction ids).
        sortFunc: callable scoring a single pathway.
        optFunc: callable picking the winning score from the list of scores
            (``max`` by default); the first pathway with that score is used.

    Returns:
        Integer numpy array built from ``uniqify`` applied to all reaction
        ids of the selected pathways.
    """
    selected = []
    for target in Core:
        candidates = pathDict[target]

        # Score every candidate, then take the first one whose score equals
        # the optimum chosen by optFunc.
        scores = [sortFunc(path) for path in candidates]
        selected.append(candidates[scores.index(optFunc(scores))])

    # Flatten everything behind an empty float array, which is what repeated
    # np.append calls starting from np.array([]) would produce.
    flattened = np.concatenate([np.array([])] +
                               [np.ravel(p) for p in selected])

    return np.array(uniqify(flattened)).astype(int)
Beispiel #8
0
def propagate_single_for_medium(org, medium):
    """List the metabolites reachable by organism ``org`` when fed ``medium``.

    The organism's reactions are read from ``strain_reactions/<org>.txt``
    (whitespace-separated reaction names) and restricted to those present
    in ``rxns``.

    Args:
        org: organism name used to build the reaction file path.
        medium: list of KEGG metabolite ids; it is concatenated with
            ``Currency`` to form the seed set.

    Returns:
        ``uniqify`` applied to the KEGG ids of every metabolite that is
        non-zero in the first element returned by ``giveScope``.

    Raises:
        OSError: if the organism's reaction file cannot be opened.
    """
    # Defining the seed set to be the medium and the currency.
    seedVec = np.zeros(len(rxnMat.T))
    seedVec[[kegg_to_id[e] for e in Currency + medium]] = 1

    # Getting all the reactions performable by this organism. The context
    # manager closes the file promptly; the original left the handle open.
    with open('strain_reactions/' + org + '.txt', 'r') as handle:
        can = handle.read().split()
    can = list(set(can) & set(rxns))
    avrxns = [rxn_kegg_to_id[e] for e in can]

    # Calculating the metabolites within the scope of
    # this organism's reaction network.
    scopeMets = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                          sumRxnVec[avrxns])[0]

    # Translating the in-scope metabolite indices back to KEGG ids.
    return uniqify([id_to_kegg[e] for e in np.where(scopeMets)[0]])
Beispiel #9
0
def propagate_core_for_medium(corerxns, medium):
    """List the metabolites reachable by ``corerxns`` when fed ``medium``.

    Args:
        corerxns: iterable of numeric reaction ids; each is turned into a
            KEGG-style name ('R' followed by the id left-padded with zeros
            to five characters) and kept only if present in ``rxns``.
        medium: list of KEGG metabolite ids; it is concatenated with
            ``Currency`` to form the seed set.

    Returns:
        ``uniqify`` applied to the KEGG ids of every metabolite that is
        non-zero in the first element returned by ``giveScope``.
    """
    # Seed vector: one slot per column of rxnMat, set to 1 for the currency
    # and medium metabolites.
    seed_ids = [kegg_to_id[met] for met in Currency + medium]
    seeds = np.zeros(len(rxnMat.T))
    seeds[seed_ids] = 1

    # Reaction names that exist in the loaded network, as matrix indices.
    names = ['R' + str(int(rid)).rjust(5, '0') for rid in corerxns]
    known = list(set(names) & set(rxns))
    reactions = [rxn_kegg_to_id[name] for name in known]

    # First element of giveScope's result is the metabolite scope vector.
    scope = giveScope(rxnMat[reactions], prodMat[reactions], seeds,
                      sumRxnVec[reactions])[0]

    reachable = [id_to_kegg[idx] for idx in np.nonzero(scope)[0]]
    return uniqify(reachable)
Beispiel #10
0
def core_rxns(strains, CUTOFF=0.95):
    """Return the reaction ids present in at least ``CUTOFF`` of the strains.

    Each strain's reactions are read from ``strain_reactions/<strain>.txt``
    (whitespace-separated reaction names), restricted to those in ``rxns``
    and translated with ``rxn_kegg_to_id``.

    Args:
        strains: sized iterable of strain names.
        CUTOFF: minimum fraction of strains that must contain a reaction
            for it to be reported.

    Returns:
        List of reaction ids, in the order produced by
        ``uniqify(unlistify(...))`` over the per-strain sets.

    Raises:
        OSError: if a strain's reaction file cannot be opened.
    """
    # Built once; the original rebuilt this set for every strain.
    known_rxns = set(rxns)

    all_sets = []
    for org in strains:
        # Getting all the reactions performable by this organism. The
        # context manager closes each file; the original leaked the handles.
        with open('strain_reactions/' + org + '.txt', 'r') as handle:
            can = set(handle.read().split()) & known_rxns
        all_sets.append({rxn_kegg_to_id[e] for e in can})

    pangenes = uniqify(unlistify(all_sets))
    core_genes = []
    for gene in pangenes:
        # Number of strains whose reaction set contains this reaction.
        n_present = sum(gene in rxn_set for rxn_set in all_sets)
        if float(n_present) / len(strains) >= CUTOFF:
            core_genes.append(gene)

    return core_genes
Beispiel #11
0
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import urllib.request
import numpy as np
from uniqify import uniqify
from unlistify import unlistify
import os
import json
from load_kegg import *

# Getting the set of bacterial abbreviations for each organism in KEGG.
# Two pangenome tables are loaded: the "fuller" one drives everything below,
# the older one is only used for its species column.
pangenome_df = pd.read_csv('fuller_pangenome_df.csv')
old_pangenome_df = pd.read_csv('pangenome_df.csv')
# uniqify is a project helper; presumably it removes duplicates -- confirm
# whether it also preserves order.
species = uniqify(list(pangenome_df['species'].values))
old_species = uniqify(list(old_pangenome_df['species'].values))
all_strains = uniqify(list(pangenome_df['strain'].values))
strain_abbrs = uniqify(list(pangenome_df['kegg_abbr'].values))
# Positions 0 .. len(all_strains) - 1 as a numpy array.
index_array = np.array(list(range(len(all_strains))))

# Getting all SEED organisms.
seed_orgs = pd.read_excel('seed_orgs.xlsx')
list_seed_names = list(seed_orgs['seed_name'].values)
# Text between the first "(" and the first ")" of every SEED name. When a
# name has no "(", find() returns -1 and the slice starts at index 0.
seed_onames = [s[s.find("(")+1:s.find(")")] for s in list_seed_names]
# Parenthesised name -> full SEED name (later duplicates overwrite earlier).
smap = dict(zip(seed_onames, list_seed_names))

exact_in_seed_dict = {}
exact_species = {}
for s in list(set(seed_onames) & set(all_strains)):
    exact_in_seed_dict[smap[s]] = pangenome_df.loc[
from load_data import *
from uniqify import uniqify
from unlistify import unlistify

# For each core molecule, a 0/1 vector marking the rows of prodMat whose
# entry in that molecule's column equals 1.
coreProdRxns = {coreTBP: (prodMat[:, coreTBP] == 1) * 1 for coreTBP in Core}
frglList = []
NUM_RAND_ORGS = 1000

for thisIter in tqdm(range(NUM_RAND_ORGS)):
    # Build a random organism: one randomly drawn pathway per core molecule.
    orgPathDict = {}
    for coreTBP in Core:
        orgPathDict[coreTBP] = list(
            random.choice(pathDict[coreTBP]).astype(int))

    # All reactions of the organism, de-duplicated across its pathways.
    orgRxns = np.array(uniqify(unlistify(orgPathDict.values()))).astype(int)

    # Pick a single reaction at random and record how many core molecules
    # have that reaction in their chosen pathway.
    remRxn = np.random.choice(orgRxns)
    thisRxnList = [target for target in Core if remRxn in orgPathDict[target]]
    frglList.append(len(thisRxnList))

import matplotlib.pyplot as plt

# Histogram of the values collected in frglList.
fig, ax = plt.subplots(1)
# Every sample gets weight 1 / len(frglList), so the bar heights are
# fractions of all samples (summing to 1) rather than raw counts.
myWeights = np.ones_like(frglList) / len(frglList)
ax.hist(frglList, bins=7, color='gray', weights=myWeights)
Beispiel #13
0
            if quitFlag:
                break
            while True:
                if tTried > 1e5:
                    quitFlag = True
                    break
                tFlag = False
                tTried += 1
                # Generate a random organism.
                while True:
                    orgPathDict = {}
                    for coreTBP in Core:
                        orgPathDict[ coreTBP ] = list( random.choice( pathDict[ coreTBP ] ).astype(int) )
                    
                    # Storing the list of reactions.
                    orgRxns = np.array( uniqify( unlistify( orgPathDict.values() ) ) ).astype( int )

                    # Creating a dictionary of secretions.
                    orgSecDict = {}
                    for coreTBP in Core:
                        orgSecDict[ coreTBP ] = list( np.nonzero( pathwaySecByproducts( 
                                                      orgPathDict[ coreTBP ], orgRxns, 
                                                      rxnMat, prodMat, Core ) )[0] )
                    
                    # Purging duplicates to have only unique byproducts.
                    tempSet = set()
                    fullSet = unlistify( orgSecDict.values() )
                    duplicates = set(x for x in fullSet if x in tempSet or tempSet.add(x))
                    orgSecDict = { coreTBP: list( set( orgSecDict[ coreTBP ] ).difference( duplicates ) ) 
                                   for coreTBP in Core }
Beispiel #14
0
        tO = anc_recon_table.loc[anc_recon_table['Node'] == node.name[1:-1]]
    else:
        tO = anc_recon_table.loc[anc_recon_table['Node'] == node.name]

    return tO['Prob'].values


# Now traversing the tree and inferring ancestral states for all unmarked nodes.
for thisNode in nodes:
    # Only nodes without a `genotype` attribute are reconstructed. hasattr
    # reacts to AttributeError alone, whereas the previous bare `except:`
    # also hid unrelated failures (including KeyboardInterrupt).
    if not hasattr(thisNode, 'genotype'):
        thisNode.add_feature('genotype',
                             reconAncestor(anc_recon_table, thisNode))

# Sorted, de-duplicated gene ids across all entries of geneDict, then
# restricted to the positions listed in good_indices.
gene_ids = sorted(uniqify(unlistify(list(geneDict.values()))))
gene_ids = list(np.array(gene_ids)[good_indices])


# Using first ancestral genotype inference method to calculate gains and losses.
def giveGainsAndLosses(parent, child):
    gainGenes, lostGenes = set(), set()
    for indx, geneID in enumerate(gene_ids):
        parentProb, childProb = parent.genotype[indx], child.genotype[indx]

        # Order is present, absent, gain and loss.
        prsnProb = parentProb * childProb
        absnProb = (1 - parentProb) * (1 - childProb)
        gainProb = (1 - parentProb) * childProb
        lossProb = parentProb * (1 - childProb)
Beispiel #15
0
def _get_unique_slices_list(day):
    """Return the WeatherTimeSlice rows for ``day``, de-duplicated by ``id``.

    Args:
        day: value matched against the ``day_of_occurance`` field.

    Returns:
        Whatever ``uniqify`` returns for the filtered queryset when keyed
        on each row's ``id``.
    """
    def slice_key(weather_slice):
        return weather_slice.id

    slices_for_day = WeatherTimeSlice.objects.filter(day_of_occurance=day)
    return uniqify(slices_for_day, slice_key)
Beispiel #16
0
def _get_unique_weather_list(site):
    """Return the site's DayOfWeather rows, de-duplicated by timestring.

    The WeatherWatchQueue whose ``relevant_site`` is ``site`` is fetched,
    its DayOfWeather rows are ordered by ``date_it_happens``, and
    ``uniqify`` is applied with ``as_machine_timestring()`` as the key.

    Args:
        site: value matched against ``WeatherWatchQueue.relevant_site``.

    Returns:
        Whatever ``uniqify`` returns for the ordered queryset.
    """
    def timestring_key(day_of_weather):
        return day_of_weather.as_machine_timestring()

    queue = WeatherWatchQueue.objects.get(relevant_site=site)
    ordered_days = DayOfWeather.objects.filter(
        weather_stream=queue).order_by("date_it_happens")
    return uniqify(ordered_days, timestring_key)
#     urllib.request.urlretrieve( thisURL, saveDir + thisOrg + '.txt' )

# Tracking the biomass reaction name in each organism's reaction set.
saveDir = 'bigg_orgs/'
biomass_url_holder = 'http://bigg.ucsd.edu/api/v2/models/'
org_bm_list = []
for thisOrg in tqdm(org_biggids):
    with open(saveDir + thisOrg + '.txt', 'r') as f:
        # Only the first line of the saved file is inspected.
        s = f.readline()
        # First "bigg_id" value on that line that starts with BIOMASS; the
        # [0] raises IndexError when the line contains no such id.
        thisBMname = re.findall(r'\"bigg_id\": \"(BIOMASS.*?)\"', s)[0]
        org_bm_list.append(thisBMname)
        # Disabled download of the biomass reaction file; the next section
        # expects saveDir + thisBMname + '.txt' to exist already.
        # thisURL = biomass_url_holder + thisOrg + '/reactions/' + thisBMname
        # urllib.request.urlretrieve(thisURL, saveDir + thisBMname + '.txt')

# Getting the unique list.
org_bm_list = uniqify(org_bm_list)

#-------------------------------------------------------------------------
# Now mapping the metabolite names to possible KEGG IDs.
#-------------------------------------------------------------------------
bmKEGGS = []
for thisBMname in tqdm(org_bm_list):
    thisbm = open(saveDir + thisBMname + '.txt', 'r').readline()
    biggids = re.findall(r'\"bigg_id\": \"(.*?)\"', thisbm)[:-2]
    bmMetsDF = pd.DataFrame({'universal_bigg_id': biggids})
    filt_bigg_df = bigg_df.merge(bmMetsDF, on='universal_bigg_id')
    relLinks = list(filt_bigg_df['database_links'].values)

    # Now identifying all the KEGG IDs.
    for tl in relLinks:
        try:
Beispiel #18
0
baddeds_all = []
cwastes_all = []
while len(fitterDB) < 500:
    print_progress_bar(len(fitterDB), 500, 'Building mutualisms database')
    while True:
        tFlag = False
        tTried += 1
        # Generate a random organism.
        while True:
            orgPathDict = {}
            for coreTBP in Core:
                orgPathDict[coreTBP] = list(
                    random.choice(pathDict[coreTBP]).astype(int))

            # Storing the list of reactions.
            orgRxns = np.array(uniqify(unlistify(
                orgPathDict.values()))).astype(int)

            # Creating a dictionary of secretions.
            orgSecDict = {}
            for coreTBP in Core:
                orgSecDict[coreTBP] = list(
                    np.nonzero(
                        pathwaySecByproducts(orgPathDict[coreTBP], orgRxns,
                                             rxnMat, prodMat, Core))[0])

            # Purging duplicates to have only unique byproducts.
            tempSet = set()
            fullSet = unlistify(orgSecDict.values())
            duplicates = set(x for x in fullSet
                             if x in tempSet or tempSet.add(x))
            orgSecDict = {
Beispiel #19
0
                       'r').readlines()).split()
    can = list((set(can)) & set(rxns))
    avrxns = [rxn_kegg_to_id[e] for e in can]

    # Calculating the metabolites within the scope of
    # this organism's reaction network.
    scopeMets = giveScope(rxnMat[avrxns], prodMat[avrxns], seedVec,
                          sumRxnVec[avrxns])[0]

    # Finding how much of the core is within the network's scope.
    return uniqify([id_to_kegg[e] for e in np.where(scopeMets)[0]])


# Strain bookkeeping: distinct species and KEGG abbreviations taken from the
# pangenome table, plus one integer position per strain.
pangenome_df = pd.read_csv('pangenome_df.csv')
species = uniqify(list(pangenome_df['species'].values))
all_strains = uniqify(list(pangenome_df['kegg_abbr'].values))
index_array = np.array(list(range(len(all_strains))))

# Seed metabolites initially provided; every medium is a 1-tuple holding a
# single KEGG id.
seeds_df = pd.read_csv('../black_queen_critique/seeds_from_vitkup.csv')
media = list(seeds_df['kegg_id'].values)
media_sets = list(itertools.combinations(media, 1))

# Null distribution: random groups of two strain positions, capped at
# NUM_SAMPLES groups, translated back to strain abbreviations.
NUM_SAMPLES = 1000
sample_indices = collect_samples(index_array, 2, NUM_SAMPLES)[:NUM_SAMPLES]
samples = [[all_strains[member] for member in group]
           for group in sample_indices]