def subprocessPerLayer(layer_names, parm): # Spawn the subprocesses to calculate stats for each layer # The number of pairs to compare including compare to self pairCount = len(parm['statsLayers']) ** 2 pool.hostProcessorMsg() print 'Starting to build', pairCount, 'layer pairs...' """ # TODO easy testing without subprocesses for layerA in parm['statsLayers']: parm['layerA'] = layerA parm['layerIndex'] = layer_names.index(layerA) oneLayer = ForEachLayer(parm) oneLayer() """ # Handle the stats for each layer, in parallel allLayers = [] for layerA in parm['statsLayers']: parm['layerA'] = layerA parm['layerIndex'] = layer_names.index(layerA) allLayers.append(ForEachLayer(parm)) print pool.hostProcessorMsg() print len(parm['statsLayers']), 'subprocesses to run, one per layer.' pool.runSubProcesses(allLayers)
def subprocessPerLayer(layer_names, parm):
    # Spawn the subprocesses to calculate stats for each layer.
    #
    # NOTE(review): this definition is byte-for-byte identical to the
    # subprocessPerLayer defined earlier in this file; being the later
    # definition, it is the one bound at import time. Confirm and remove
    # one of the two copies.
    #
    # layer_names: ordered list of layer names (used to find each
    #              layer's index).
    # parm: shared parameter dict, mutated in place ('layerA',
    #       'layerIndex') before each worker is built.

    # The number of pairs to compare including compare to self
    pairCount = len(parm['statsLayers'])**2
    pool.hostProcessorMsg()
    print 'Starting to build', pairCount, 'layer pairs...'

    """
    # TODO easy testing without subprocesses
    for layerA in parm['statsLayers']:
        parm['layerA'] = layerA
        parm['layerIndex'] = layer_names.index(layerA)
        oneLayer = ForEachLayer(parm)
        oneLayer()
    """

    # Handle the stats for each layer, in parallel
    allLayers = []
    for layerA in parm['statsLayers']:
        parm['layerA'] = layerA
        parm['layerIndex'] = layer_names.index(layerA)
        allLayers.append(ForEachLayer(parm))

    print pool.hostProcessorMsg()
    print len(parm['statsLayers']), 'subprocesses to run, one per layer.'
    pool.runSubProcesses(allLayers)
def normalized_pearson_statistics(layers, layerNames, nodes_multiple, ctx, options): # For every binary layer and for every layout, we need a file called # layer_<layer number>_<layout number>_region.tab with scores associating # that layer with other layers of its type. This uses a normalized pearson # correlation to produce a score between -1 and 1, where -1 is the most # anticorrelated, 0 is no correlation and 1 is the most correlated. # # For binary data we are using a normalized pearson correlation that # accounts for sample biases. We are using these logical steps to get there: # # 1. Use the extents of the node positions as the first window. # # 2. Divide the window into a 2 X 2 grid containing 4 new equal-sized windows. # # 3. Scan across all nodes, placing them into the appropiate window. These # are stored as a vector of windows, with a vector of node names in each # window. For the toy example, we will show node counts rather than names. # toy: C = [2, 1, 3, 5] # # 4. Drop any windows from the vector whose node count is below the # lower_threshold. # toy: with a lower_threshold of 2: # C1 = [2, 3, 5] # # 5. Repeat Steps 2 - 4 for each window containing more nodes than the upper # threshold # # 6. Normalize this vector of counts. For each element: # a. divide by the sum of the elements. # toy: C2 = [1/5, 3/5, 1] # b. multiply by the pseudocount of 5 # toy: C3 = [1, 3, 5] # # 7. Look at each attribute compared with each other attribute. For each # pair of attributes: # a. For attribute A, create a vector with counts of the number # of nodes in which attribute A is 1. Create an element in A for each # window in C1. # toy: A = [1, 2, 2] # # b. Do the same for attribute B. # toy: B = [0, 1, 1] # # c. Compare vectors A and B. For each node in a window that # has a value of one for both A and B, decrement that window count # in both vectors. # toy: if the 2nd window has one node that is in A & B: # A1 = [1, 1, 2], B1 = [0, 0, 1] # # d. 
For each element of A1 & B2 add the corresponding element of C3. # toy: A2 = [2, 4, 4], B2 = [1, 3, 6] # # e. Correlate vectors A and B with the Pearson method which returns an # R correlation and a P value. for layout in ctx.all_hexagons.iterkeys(): # We look at all layouts for this. # Create the windows containing lists of node names in each window. # Following our naming scheme above, assign C to the curated windows # Note we find the windows even if we are not computing layout-aware # stats so we can use the windowing later for dynamic stats. C = window_tool( options.directory, nodes_multiple[layout], options.mi_window_threshold, options.mi_window_threshold_upper, layout, ) # Transform the nodes lists in C to a list of node counts C1 = map(lambda x: len(x), C) # Normalize the node counts to create the windows addtives: # divide by the sum of the counts and multiply by the pseudocount. Sum = sum(C1) C2 = map(lambda x: float(x) * PSEUDOCOUNT / Sum, C1) # Write the window node counts and additives to a file for use in # dynamic stats initiated by the client filename = os.path.join(options.directory, 'windows_' + str(layout) + '.tab') with open(filename, 'w') as f: f = csv.writer(f, delimiter='\t') i = 0 for nodes in C: line = [C2[i]] for node in nodes: line.append(node) f.writerow(line) i += 1 if not options.mutualinfo: print 'Skipping sort stats for layout-aware' continue if ctx.binary_layers == 0: print 'No binary layers for layout-aware stats to process' continue # The number of pairs to compare without compare to self pairCount = len(ctx.binary_layers) ** 2 - len(ctx.binary_layers) print 'Starting to build', pairCount, 'layer pairs...' 
# Create the stats parameters parm = { 'directory': options.directory, 'layers': layers, 'layout': str(layout), 'statsLayers': ctx.binary_layers, 'windowAdditives': C2, 'windowNodes': C, } """ # TODO easy testing without subprocesses for layerA in parm['statsLayers']: parm['layerA'] = layerA parm['layerIndex'] = layerNames.index(layerA) oneLayer = ForEachLayer(parm) oneLayer() """ # Handle the stats for each layer, in parallel allLayers = [] for layer in parm['statsLayers']: parm['layerA'] = layer parm['layerIndex'] = layerNames.index(layer) allLayers.append(ForEachLayer(parm)) print pool.hostProcessorMsg() print len(ctx.binary_layers), 'subprocesses to run, one per layer.' pool.runSubProcesses(allLayers) print timestamp(), 'Stats complete for layout:', layout