def convert_timeseries_to_intervalseries(timeseries, yaxis_only=False):
    '''
    Computes the ordered series of gaps between consecutive dates.

    Will not accept any negative intervals (the timeseries must already be
    in chronological order).

    input:
        timeseries: [[numeric_date_from_start, arbitrary value of interest], ...]
        yaxis_only: False by default. If True the return is a flat list [20, 6, ...]
    output:
        intervalseries: [[0, 20], [1, 6], ...] (or a flat list of gaps)
    '''
    intervalseries = []
    #Walk consecutive pairs of points in lockstep
    for idx, (current, following) in enumerate(zip(timeseries, timeseries[1:])):
        gap = following[0] - current[0]
        if gap < 0:
            #Out-of-order input is not acceptable
            m = 'Negative interval detected, this should not be an out of order timeseries'
            gerr.generic_error_handler(message = m)
        elif yaxis_only:
            intervalseries.append(gap)
        else:
            intervalseries.append([idx, gap])
    return intervalseries
def from_list(settings, data, naming_scheme, exclusions=None, chunk_size=100000, inclusion=None):
    '''
    Inserts a list-of-lists dataset into a mongo collection.

    input:
        settings - mongo connection settings (mongoConnect standard)
        data - list of lists. Rows must correspond to headers in naming_scheme
        naming_scheme - dictionary mapping in-set list index to column/key name to use
        (optional) exclusions - headers to ignore in naming_scheme. Must be a value in the dictionary.
        (optional) chunk_size - maximum number of documents per insert call
        (optional) inclusion - key/value pairs copied into every document
    output:
        None
    dependencies:
        gale
    '''
    import gale.databases.mongoConnect as mcxn
    import gale.general.errors as err
    #BUGFIX: no mutable default arguments -- [] / {} defaults are shared
    #across calls; normalize None to fresh containers instead
    if exclusions is None:
        exclusions = []
    if inclusion is None:
        inclusion = {}
    #BUGFIX: work on a copy so exclusions do not delete entries from the
    #caller's naming_scheme dictionary
    naming_scheme = dict(naming_scheme)

    def _transform_row(theader, trow):
        '''Converts one row into a document dict, skipping 'null' cells.'''
        tdict = dict(inclusion) if inclusion else {}
        for i, tkey in theader.items():
            if trow[i].lower() != 'null':
                tdict[tkey] = trow[i]
        return tdict

    #Handle the exclusions first
    for tval in exclusions:
        itemset = [i for i, j in naming_scheme.items() if j == tval]
        if len(itemset) > 1:
            m = 'populateMongo.from_list, line 28\n'
            m += 'More than one value in naming scheme matches the exclusion value'
            err.generic_error_handler(message=m)
        del naming_scheme[itemset[0]]
    #Check to make sure all values are distinct
    if len(naming_scheme.values()) != len(list(set(naming_scheme.values()))):
        m = 'populateMongo.from_list, line 28\n'
        m += 'Values in naming scheme not unique'
        err.generic_error_handler(message=m)
    data = [_transform_row(naming_scheme, datarow) for datarow in data]
    #Iterate over subsets of the list so very large datasets are inserted
    #in chunk_size batches
    tdb = mcxn.MongoConnection(settings)
    if len(data) < chunk_size:
        tdb.collection.insert(data)
    else:
        for i in range(0, len(data), chunk_size):
            tdb.collection.insert(data[i:i + chunk_size])
    tdb.tearDown()
def gini(data):
    '''
    Calculates the gini coefficient for a given dataset.

    input:
        data - list of values, either raw counts or frequencies.
               Frequencies MUST sum to 1.0, otherwise they will be
               transformed to frequencies. Raw counts will likewise be
               transformed to frequencies.
    output:
        gini - float, from 0.0 to 1.0 (1.0 most likely never realized
               since it is only achieved in the limit)
    '''

    def _unit_area(height, value, width):
        '''
        Calculates a single bar's area under the Lorenz curve.

        Area is composed of two parts:
            The height of the bar up until that point (a rectangle)
            The addition from the current value (calculated as a triangle)
        input:
            height: previous bar height, i.e. sum of values up to current value
            value: current value
            width: width of an individual bar
        output:
            bar_area: area of current bar
        '''
        bar_area = (height * width) + ((value * width) / 2.)
        return bar_area

    #Fair (perfect-equality) area will always be 0.5 when frequencies are used
    fair_area = 0.5
    #Check that input data has non-zero values, if not throw an error
    datasum = float(sum(data))
    if datasum == 0:
        m = 'Data sum is 0.0.\nCannot calculate Gini coefficient for non-responsive population.'
        gerr.generic_error_handler(message=m)
    elif datasum < 0.99:
        #Frequencies that do not reach ~1.0 indicate a malformed input
        m = 'Data sum is frequencies and less than 1.0.'
        gerr.generic_error_handler(message=m)
    #If data does not sum to 1.0 transform to frequencies
    elif datasum > 1.0:
        data = [x / datasum for x in data]
    #BUGFIX: sort a copy -- data.sort() mutated the caller's list whenever
    #the input already summed to ~1.0 and was not rebound above
    data = sorted(data)
    #Calculate the area under the curve for the current dataset
    width = 1 / float(len(data))
    height, area = 0.0, 0.0
    for value in data:
        area += _unit_area(height, value, width)
        height += value
    #Calculate the gini as the normalized gap between fair and observed area
    gini = (fair_area - area) / fair_area
    return gini
def fold_change(obs, exp):
    '''
    Rescales the observation (either a single value or list) by the
    expected value.

    Cannot accept zero as the expected value.
    input:
        obs -- int/float or list of int/floats
        exp -- int/float (must be non-zero)
    output:
        norm -- int/float or list of int/floats
    '''
    if exp == 0:
        m = "Cannot accept zero as an expected value"
        gerr.generic_error_handler(message=m)
    #BUGFIX: isinstance instead of type(...) == list, so list subclasses
    #are rescaled element-wise as well
    elif isinstance(obs, list):
        norm = [ival / float(exp) for ival in obs]
    else:
        norm = obs / float(exp)
    return norm
def create_all_combinations(n):
    '''
    Creates all possible combinations up to length k, where k equals the
    size of list n.

    Maximum size is 10. because this would just get stupid otherwise
    input:
        n - list of values
    output:
        combs - of all combinations
    '''
    from itertools import combinations
    #exception
    if len(n) > 10:
        import gale.general.errors as gerr
        m = 'List length too long for this function'
        gerr.generic_error_handler(message=m)
    #Collect every size class in order, shortest combinations first
    size_groups = (combinations(n, size) for size in range(1, len(n) + 1))
    combs = [combo for group in size_groups for combo in group]
    return combs
def parse_infomap(comfile, netfile='', hierarchy=True):
    '''
    Parses an infomap community file.

    Returns the (now) standard mod2node and node2mod dictionaries.
    If recursion is wanted then the will ...
    ****Network features still missing*****
    inputs:
        comfile- name of infomap community file
        netfile- network file, will have more information
        hierarchy- boolean, toplevel or all levels.
    outputs:
        mod2node - dictionary with module class
        node2mod - dictionary mapping node name to its module name
    '''
    import sys
    try:
        import networkx as nx
    except ImportError:
        #NOTE(review): a missing networkx also disables hierarchical module
        #naming below, not just network features -- confirm this is intended
        hierarchy = False
        #BUGFIX: py2-only `print >> sys.stderr` replaced with a write that
        #works on both Python 2 and 3
        sys.stderr.write("NetworkX is not available\n")
        sys.stderr.write("Any network features of the modules will not be calculated\n")
    import gale.general.errors as gerr

    class Module(object):
        '''
        The module class.
        Contains information related to sub-modules, such as sub-hierarchy,
        connecting modules, connecting partners, size, and nodes
        '''
        def __init__(self):
            ##Level - 0, 1, 2, 3
            self.level = None
            ##Hierarchy
            self.children = False
            self.submodules = []
            ##Size related
            self.nodes = []
            self.size = None
            ##Network characteristics
            #module:link strength
            self.connect_modules = {}
            #node: outside_node
            self.connect_nodes = {}

        def attribute_generators(self):
            #Derive size from the accumulated node list
            self.size = len(self.nodes)

    def _reader(fname):
        '''
        Reads the community file; ignores '#' comments and blank lines.
        Returns [[module_path_list, node_name], ...]
        '''
        data = []
        #BUGFIX: use the fname parameter (previously the closure variable
        #comfile was read instead) and close the handle when done
        with open(fname) as handle:
            for line in handle:
                if line[0] == '#':
                    #Comment
                    continue
                if line == '' or line == ' ' or line == '\t' or line == '\n':
                    #Blank line
                    continue
                sline = line.split()
                #First part is modules, then numeric, then node name
                modlisting = sline[0].split(':')
                del modlisting[-1]
                #Kill the double quotes, join anything split with a space in the node name
                #nodename = ' '.join(sline[2:])[1:-1]
                nodename = sline[-1]
                data.append([modlisting, nodename])
        return data

    #Reference variables
    mod2node = {}
    node2mod = {}
    #Read in the community data
    comdata = _reader(comfile)
    for mods, node in comdata:
        #Start with the easy part first
        if node in node2mod:
            m = 'Duplicitous node identifier'
            gerr.generic_error_handler(message=m)
        #Check on hierarchy: full module path vs top level only
        if hierarchy:
            modname = '-'.join(mods)
        else:
            modname = mods[0]
        #Add the node2mod
        node2mod[node] = modname
        #Go through the modules in the listing
        for i in range(len(mods)):
            tmod = '-'.join(mods[:i + 1])
            #Start the class
            if tmod not in mod2node:
                mod2node[tmod] = Module()
                mod2node[tmod].level = i
            #Check if there are children
            if len(mods) > (i + 1):
                mod2node[tmod].children = True
                mod2node[tmod].submodules.append(mods[i + 1])
            #Class upkeeping
            mod2node[tmod].nodes.append(node)
    #class upkeeping
    for module in mod2node:
        mod2node[module].attribute_generators()
    return mod2node, node2mod