def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """points are (x-index, y-index, cscore) per chromosome pair."""
    # This is the core single linkage algorithm.
    # It behaves close to O(n) in practice: we iterate through the sorted pairs and,
    # for each pair, look back only at nearby preceding pairs to find links.
    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i - 1, -1, -1):
            # x-axis distance
            del_x = points[i][0] - points[j][0]
            if del_x > max_dist:
                break
            # y-axis distance
            del_y = points[i][1] - points[j][1]
            if del_x + abs(del_y) > max_dist:
                continue
            #if abs(del_y) > Nmax: continue
            # otherwise join
            clusters.join(points[i], points[j])

    clusters = [cluster for cluster in list(clusters)
                if len(cluster) >= min_cluster_size]
    return clusters
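# The single_linkage snippet above and many snippets below lean on a union-find style
# `Grouper`: disjoint sets of items built up with join() and read back by iterating
# over the groups. The real class is not included in this file, and its interface
# varies a little between projects (some index it with grouper[item], some expect
# find() to return the whole group). The following is only a minimal sketch of one
# plausible implementation, under a hypothetical name, to make the calls above concrete.
class GrouperSketch(object):
    """Disjoint sets of hashable items; iterating yields each group as a list."""

    def __init__(self, initial_items=()):
        self._parent = {}
        for item in initial_items:
            self.join(item)

    def find(self, item):
        # Path-compressing find; unseen items become their own singleton group.
        self._parent.setdefault(item, item)
        root = item
        while self._parent[root] != root:
            root = self._parent[root]
        while self._parent[item] != root:
            self._parent[item], item = root, self._parent[item]
        return root

    def join(self, a, *others):
        # Merge the groups containing `a` and every item in `others`.
        root = self.find(a)
        for other in others:
            self._parent[self.find(other)] = root

    def joined(self, a, b):
        return self.find(a) == self.find(b)

    def __getitem__(self, item):
        # The whole group containing `item`, as some snippets below use (e.g. g[gene1]).
        root = self.find(item)
        return [i for i in self._parent if self.find(i) == root]

    def __iter__(self):
        groups = {}
        for item in self._parent:
            groups.setdefault(self.find(item), []).append(item)
        return iter(groups.values())

# Usage sketch: g = GrouperSketch(); g.join((1, 2), (1, 3)); list(g) -> [[(1, 2), (1, 3)]]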
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm that has better running time than the naive O(n^2):
    assume each block has x_ends and y_ends for its bounds
    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last:
            active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else:  # right end
            active.remove(i)

        chr_last = chr

    return mergeables
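# `range_overlap` above is assumed to take two (chromosome, start, stop) triples
# (the same shape as range_x / range_y in eclusters) and report whether they
# intersect; it is not defined in these snippets. A hedged sketch of one plausible
# helper, under a hypothetical name:
def range_overlap_sketch(range_a, range_b):
    """True if two (chr, start, stop) intervals share a chromosome and overlap."""
    a_chr, a_min, a_max = range_a
    b_chr, b_min, b_max = range_b
    return a_chr == b_chr and a_min <= b_max and b_min <= a_max

# e.g. range_overlap_sketch(("chr1", 100, 200), ("chr1", 150, 300)) -> True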
def generate_sim_records(records, sim_record_name, dis_record_name):
    sim_prefix = sim_record_name + ':'

    grouper = Grouper()
    groups = grouper.group_records(records)

    output = ""

    for g in groups.keys():
        sim_record_name = get_sim_name(groups[g].main)

        # Check sim record does not already exist - maybe someone started writing the records but got bored!
        if sim_record_name in records:
            continue

        # Skip record if it is a simulation record
        if groups[g].main.startswith(sim_prefix):
            continue

        # Skip adding sim record if the original is a soft record
        if records[groups[g].main].dtyp is None or records[groups[g].main].dtyp.lower() == "soft channel":
            continue

        # No point simulating SIM or DISABLE
        if groups[g].RB != sim_record_name and groups[g].RB != dis_record_name:
            typ = records[groups[g].main].type
            # Don't add simulation record unless the type is suitable
            if typ in ALLOWED_SIM_TYPES:
                print "ADDED SIM RECORD =", sim_record_name
                output += generate_record_text(records[groups[g].main], groups[g].RB, groups[g].SP, groups[g].SP_RBV)

    return output
def group():
    """run the grouper"""
    logging.debug("Start grouping")
    groupi = Grouper()
    # pass True to use mapreduce and False to use combine
    groupi.run(True)
    logging.debug("Finished grouping")
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    # get all synteny blocks for a query, algorithm is single linkage
    # anchors are a window centered on query
    # two categories of syntenic regions depending on what query is:
    # (Syntelog): syntenic region is denoted by the syntelog
    # (Gray gene): syntenic region is marked by the closest flanker
    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    a, b = itertools.tee(ysorted)
    next(b, None)
    for ia, ib in itertools.izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = get_flanker(group, query)

        # y-boundary of the block
        gs = [x[1] for x in group]
        left, right = min(gs), max(gs)

        # run a mini-dagchainer here, take the direction that gives us most anchors
        orientation = "+"
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff:
            continue

        # this characterizes a syntenic region (left, right).
        # syntelog is -1 if it's a gray gene
        syn_region = (syntelog, left, right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
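# `longest_increasing_subsequence` / `longest_decreasing_subsequence` above are
# assumed to return the chosen items themselves (the code indexes back into `group`
# with them), not just a length. A hedged O(n log n) patience-sorting sketch of the
# increasing variant, under hypothetical names, is below; the decreasing variant can
# reuse it on the reversed sequence.
import bisect

def lis_sketch(seq):
    """Return one longest strictly increasing subsequence of seq."""
    tails = []              # tails[k] = smallest tail of an increasing run of length k+1
    tail_idx = []           # index into seq of each tail
    prev = [-1] * len(seq)  # back-pointers used to rebuild the subsequence
    for i, item in enumerate(seq):
        k = bisect.bisect_left(tails, item)
        if k == len(tails):
            tails.append(item)
            tail_idx.append(i)
        else:
            tails[k] = item
            tail_idx[k] = i
        prev[i] = tail_idx[k - 1] if k > 0 else -1
    # walk the back-pointers from the tail of the longest run
    out, i = [], tail_idx[-1] if tail_idx else -1
    while i != -1:
        out.append(seq[i])
        i = prev[i]
    return out[::-1]

def lds_sketch(seq):
    """Longest strictly decreasing subsequence, via the increasing variant on the reversed input."""
    return list(reversed(lis_sketch(list(reversed(seq)))))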
def to_groups(self, distance):
    # not used.
    g = Grouper()
    for name, anchors in itertools.groupby(self, key=lambda pair: (pair[0].seqid, pair[1].seqid)):
        anchors = list(anchors)
        for ia, (qa, sa) in enumerate(anchors[:-1]):
            qb, sb = anchors[ia + 1]
            if qb.start - qa.end <= distance and sb.start - sa.end <= distance:
                g.join((qa, sa), (qb, sb))
    return g
def test_adding_grouper_objects_together(self):
    words1 = ["apple", "animal", "lemon", "ANIMAL", "Apple"]
    words2 = ["Lemon", "Animal", "Apple", "lemon"]
    word_groups = {
        "apple": ["apple", "Apple", "Apple"],
        "animal": ["animal", "ANIMAL", "Animal"],
        "lemon": ["lemon", "Lemon", "lemon"],
    }
    groups1 = Grouper(words1, key=str.lower)
    groups2 = Grouper(words2, key=str.lower)
    self.assertEqual(dict(groups1 + groups2), word_groups)
    groups3 = Grouper(words2, key=str.upper)
    with self.assertRaises(ValueError):
        groups1 + groups3  # Can't concatenate groups with different keys
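# The unit tests in this file (this one and the similar ones further down) exercise a
# mapping-style Grouper built around a key function, which is a different interface
# from the union-find Grouper used elsewhere. The tested implementation is not shown;
# the following is a hedged sketch of one class these tests would plausibly pass
# against, under a hypothetical name.
class GrouperMappingSketch(dict):
    """Group items into lists keyed by key(item); behaves like a dict of groups."""

    def __init__(self, iterable_or_mapping=(), key=None):
        super().__init__()
        self.key = key
        self.update(iterable_or_mapping)

    def group_for(self, item):
        return self.key(item)

    def add(self, item):
        self.setdefault(self.group_for(item), []).append(item)

    def update(self, iterable_or_mapping):
        # Accept either another mapping of group -> items, or a plain iterable of items.
        if hasattr(iterable_or_mapping, 'items'):
            for group, items in iterable_or_mapping.items():
                self.setdefault(group, []).extend(items)
        else:
            for item in iterable_or_mapping:
                self.add(item)

    def __add__(self, other):
        if self.key != other.key:
            raise ValueError("Groupers must share the same key to be concatenated")
        combined = GrouperMappingSketch(self, key=self.key)
        combined.update(other)
        return combined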
def system_construct(self):
    self.locals = Grouper(
        self._system_construct(self.options, self.params, self.configs))
    self.state.clear()
    for k in self.configs.keys():
        self.state[k] = SystemVar(self.locals[k], k)
    self.observables.clear()
    self.observables.update(self.locals)
    self.baselines.clear()
    self.baselines.update(
        {'mesh': fieldops.get_global_var_data(self.locals.mesh)})
    if hasattr(self.locals, 'obsVars'):
        self._fig = QuickFig(*self.locals.obsVars)
    else:
        self._fig = QuickFig(self.state[0])
def test_init_accepts_mapping(self):
    dictionary = {
        "apple": ["Apple", "apple"],
        "lemon": ["lemon"],
    }
    groups = Grouper(dictionary, key=str.lower)
    self.assertEqual(dict(groups), dictionary)
def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
    Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)

    self.grouper = Grouper(full_table, self)
    self.SCORE_ID = add_meta_column(
        chain([full_table], bad_tables, good_tables),
        'SCOREVAR'
    )
def job(grouperNum, chunksQueue, listSaveStateNameGrouper, listListLastCallNum):
    print 'Starting worker ' + str(grouperNum)
    while True:
        # Get a new chunk to process
        chunk = chunksQueue.get()

        # Work
        print 'Worker ' + str(grouperNum) + ' mapping chunk ' + str(chunk)
        MapIterator = MapChunkIterator(mapChunksNameGenerator(chunk))  # Iterator to iterate through the chunk
        theContext = MapContext(groupChunksNameGenerator(chunk), MapIterator)
        Mapper.map(theContext)

        print 'Worker ' + str(grouperNum) + ' grouping locally chunk ' + str(chunk)
        idx = listListLastCallNum[grouperNum] + 1
        theGrouper = Grouper(grouperNum, idx, idx - 1, directory)
        listSaveStateNameGrouper[grouperNum] = theGrouper.group(theContext)
        listListLastCallNum[grouperNum] = idx

        # "Close" the chunk
        chunksQueue.task_done()
def test_strings(self):
    words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
    word_groups = {
        "apple": ["Apple", "apple"],
        "animal": ["animal", "ANIMAL", "animal"],
    }
    groups = Grouper(words, key=str.lower)
    self.assertEqual(dict(groups), word_groups)
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10]
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10]

    simple_blast.sort()

    standems = Grouper()
    for name, hits in itertools.groupby(simple_blast, key=lambda x: x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chromosome and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]:
                standems.join(a[1], b[1])

    return standems
def merge_clusters(chain, clusters):
    # there are, in general, two kinds of breakpoints:
    # those induced by inversions, and those induced by translocations
    # inversion breakpoints are excessive breakpoints that I want to remove
    chain_num = len(chain)
    mergeables = Grouper()  # disjoint sets of clusters that can be merged
    for j in xrange(chain_num):
        cj = chain[j]
        mergeables.join(cj, cj)
        for i in xrange(j - 1, -1, -1):
            ci = chain[i]
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax:
                continue
            del_y = distance_y(clusters[ci], clusters[cj])
            if del_x + del_y > Nmax:
                continue
            mergeables.join(ci, cj)

    to_merge = {}
    for mergeable in mergeables:
        for m in mergeable:
            to_merge[m] = min(mergeables[m])

    merged_chain = []
    for c in chain:
        if to_merge[c] == c:  # i.e. parent of mergeables
            merged_chain.append(c)

    # refresh clusters list, merge chains
    for k, v in to_merge.iteritems():
        if to_merge[k] != k:  # i.e. not map to self
            clusters[v].extend(clusters[k])

    # maintain the x-sort
    [cluster.sort() for cluster in clusters]

    # whether anything was merged
    updated = (len(merged_chain) != chain_num)
    return merged_chain, updated
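# `distance_x` / `distance_y` above are assumed helpers that measure the gap between
# two clusters (lists of (x, y, ...) anchors) along one axis; they are not defined in
# these snippets. A hedged sketch of one plausible definition, under a hypothetical
# name, with 0 meaning the cluster spans already overlap on that axis:
def axis_gap_sketch(cluster_a, cluster_b, axis):
    """Gap between the two clusters' spans on the given axis (0 for x, 1 for y)."""
    a_lo, a_hi = min(p[axis] for p in cluster_a), max(p[axis] for p in cluster_a)
    b_lo, b_hi = min(p[axis] for p in cluster_b), max(p[axis] for p in cluster_b)
    return max(0, max(a_lo, b_lo) - min(a_hi, b_hi))

# distance_x could then be axis_gap_sketch(ca, cb, 0) and distance_y the same with axis=1.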
def load_geneorders(fp_gff):
    # load gene orders before any filtering
    fp_gff.seek(0)
    tandem = Grouper()
    print >>sys.stderr, "Read .genes file"
    # chromosome => gene_list in that chromosome
    chr_ranks = collections.defaultdict(list)
    ranks = {}  # gene => rank position
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        start = int(start)
        chr_ranks[chr].append((start, gene, chr))
        tandem.join(gene)

    for v in chr_ranks.itervalues():
        gene_rank = 0
        for start, gene, chr in sorted(v):
            ranks[gene] = (chr, gene_rank)
            gene_rank += 1

    return ranks, tandem
def single_linkage(points, xdist, ydist, N):
    # This is the core single linkage algorithm.
    # It behaves close to O(n) in practice: we iterate through the sorted pairs and,
    # for each pair, look back only at nearby preceding pairs to find links.
    clusters = Grouper()

    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i - 1, -1, -1):
            # x-axis distance
            del_x = points[i][0] - points[j][0]
            if del_x > xdist:
                break
            # y-axis distance
            del_y = points[i][1] - points[j][1]
            if abs(del_y) > ydist:
                continue
            # otherwise join
            clusters.join(points[i], points[j])

    clusters = [cluster for cluster in list(clusters) if score(cluster) >= N]
    return clusters
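# `score` above is assumed to rate a cluster of (x, y, ...) anchors; the earlier
# single_linkage variant simply filters on len(cluster), and another common choice is
# the number of distinct positions on the sparser axis. A hedged sketch of the latter,
# under a hypothetical name (not necessarily the helper this code uses):
def score_sketch(cluster):
    """Count of distinct anchor positions on whichever axis has fewer of them."""
    xs = set(p[0] for p in cluster)
    ys = set(p[1] for p in cluster)
    return min(len(xs), len(ys))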
def mergeable(group1, group2, all_ranks, quota):
    # rule no.1 - respect quota
    # rule no.2 - but only count close genes once
    micro_grouper = Grouper()  # data structure to check rule no.2
    merged_group = group1 + group2  # attempted merge
    nmerged = len(merged_group)

    # do all pairwise comparisons to find closely located genes
    # TODO: silly implementation, not efficient
    for i, genei in enumerate(merged_group):
        speciesi, chri, posi = all_ranks[genei]
        micro_grouper.join(genei)
        for j in xrange(i + 1, nmerged):
            genej = merged_group[j]
            speciesj, chrj, posj = all_ranks[genej]
            if speciesi == speciesj and chri == chrj and abs(posi - posj) <= Tandem_Nmax / 2:
                micro_grouper.join(genei, genej)

    species_count = collections.defaultdict(int)  # data structure to check rule no.1
    for gene_group in micro_grouper:
        species = all_ranks[gene_group[0]][0]
        species_count[species] += 1

    for species, count in species_count.items():
        if count > quota[species]:
            return False

    return True
def check(self):
    print "\n** CHECKING", self.file, "**"

    records = parse_db(self.file)
    grouper = Grouper()

    # Check for consistency in whether PV macros are followed by colons
    colon = None
    for r in records.keys():
        if colon is None:
            n = self.remove_macro(r, False)
            colon = n.startswith(':')
        else:
            n = self.remove_macro(r, False)
            if n.startswith(':') != colon:
                if colon:
                    self.errors.append("FORMAT ERROR: " + r + " should have a colon after the macro")
                else:
                    self.errors.append("FORMAT ERROR: " + r + " should not have a colon after the macro")

    groups = grouper.group_records(records)

    if self.debug:
        for s in groups.keys():
            print s, groups[s].RB, groups[s].SP, groups[s].SP_RBV

    for s in groups.keys():
        self.check_case(groups[s])
        self.check_chars(groups[s])
        self.check_candidates(groups[s], records)

    for w in self.warnings:
        print w
    for e in self.errors:
        print e

    print "** WARNING COUNT =", len(self.warnings), "**"
    print "** ERROR COUNT =", len(self.errors), "**"
def test_add_and_group_for_methods(self):
    names = ["Trey Hunner", "Monica Marshall", "Katherine Hunner"]

    def last_name(name):
        return name.rsplit()[-1]

    name_groups = Grouper(names, key=last_name)
    self.assertEqual(name_groups.group_for("Rose Hunner"), "Hunner")
    self.assertEqual(name_groups.group_for("Rose Klyce"), "Klyce")
    self.assertEqual(
        name_groups['Hunner'],
        ["Trey Hunner", "Katherine Hunner"],
    )
    name_groups.add('Rose Hunner')
    self.assertEqual(
        name_groups['Hunner'],
        ["Trey Hunner", "Katherine Hunner", "Rose Hunner"],
    )
    name_groups.add("Rose Klyce")
    self.assertEqual(name_groups['Klyce'], ["Rose Klyce"])
def test_tuples_of_strings(self):
    animals = [
        ('agatha', 'dog'),
        ('kurt', 'cat'),
        ('margaret', 'mouse'),
        ('cory', 'cat'),
        ('mary', 'mouse'),
    ]
    animals_by_type = {
        'mouse': [('margaret', 'mouse'), ('mary', 'mouse')],
        'dog': [('agatha', 'dog')],
        'cat': [('kurt', 'cat'), ('cory', 'cat')],
    }
    groups = Grouper(animals, key=itemgetter(1))
    self.assertEqual(dict(groups), animals_by_type)
def __init__(self):
    # Current call stack
    self.call_stack = ['__main__']

    # A mapping of which function called which other function
    self.call_dict = defaultdict(lambda: defaultdict(int))

    # Counters for each function
    self.func_count = defaultdict(int)
    self.func_count_max = 0

    self.call_stack_timer = []
    self.previous_event_return = False

    # Accumulative time per function
    self.func_time = defaultdict(float)
    self.func_time_max = 0

    self.trace_grouper = Grouper()

    self.painter = GraphvizOutput()
    self.painter.processor = self
def construct(self):
    constructed = self._construct(p=self.inputs)
    localObj = Grouper(constructed)
    del localObj['p']
    try:
        del localObj['self']
    except KeyError:
        pass
    self._construct_check(localObj)
    self._locals = localObj
    self._stateVars = list()
    for k in self.configs.keys():
        var = self._locals[k]
        if isinstance(var, np.ndarray):
            var = SwarmVar(var, k, self.inputs)
        else:
            var = GlobeVar(var, k, self.inputs)
        self._stateVars.append(var)
def test_custom_update_method(self):
    words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
    word_groups = {
        "apple": ["Apple", "apple", "APPLE", "APPLE"],
        "animal": ["animal", "ANIMAL", "animal"],
        "lemon": ["lemon", "Lemon", "lemon", "LEMON"],
        "orange": ["Orange"],
    }
    more_items = {
        "apple": ["APPLE"],
        "lemon": ["lemon", "LEMON"],
        "orange": ["Orange"],
    }
    groups = Grouper(words, key=str.lower)
    groups.update(["lemon", "Lemon", "APPLE"])
    groups.update(more_items)
    self.assertEqual(dict(groups), word_groups)
def mergedSpeakers(chat):
    gps = Grouper()
    for comment in chat:
        if comment.thread > 0:
            spkr = comment.name
            gps.join(spkr, spkr)
            for ment in comment.mentioned:
                gps.join(spkr, ment)

    gpToNum = {}
    for ctr, gp in enumerate(gps):
        gpToNum[tuple(gp)] = ctr + 1

    for comment in chat:
        if comment.thread > 0:
            gp = gps.find(comment.name)
            num = gpToNum[tuple(gp)]
            comment.thread = num

    return chat
def __init__(self, **kwargs):
    si = self._sortedInputKeys['options']
    options = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
    si = self._sortedInputKeys['params']
    params = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
    si = self._sortedGhostKeys['configs']
    configs = Grouper(OrderedDict([(k, self.ghosts[k]) for k in si]))

    dOptions = options.copy()
    dOptions['hash'] = options.hashID
    dParams = params.copy()
    dParams['hash'] = params.hashID

    # self._systemObserverClasses = self.ghosts['observers']

    super().__init__(
        options=dOptions,
        params=dParams,
        supertype='System',
        **kwargs
    )

    self.params, self.options = params, options
def make_family(gene_pairs, all_ranks, quota):
    print >>sys.stderr, "... gene family clustering started"

    g = Grouper()
    gene_pairs.sort(reverse=True)
    #pprint.pprint(gene_pairs[:10])
    for synteny_score, gene1, gene2 in gene_pairs:
        # attempt to join the two genes
        g.join(gene1)
        g.join(gene2)
        group1, group2 = g[gene1], g[gene2]
        if mergeable(group1, group2, all_ranks, quota):
            g.join(gene1, gene2)

    return g
class MR(Basic):

    def __init__(self, *args, **kwargs):
        Basic.__init__(self, *args, **kwargs)
        self.best = []
        self.max_wait = kwargs.get('max_wait', 2 * 60 * 60)  # 2 hours
        self.start = None
        self.stop = False
        self.n_rules_checked = 0
        self.naive = kwargs.get('naive', False)
        self.max_bests = 50
        self.max_complexity = kwargs.get('max_complexity', 3)
        self.checkpoints = []
        self.cost_clique = 0

    def __hash__(self):
        components = [
            self.__class__.__name__,
            str(self.aggerr.__class__.__name__),
            str(set(self.cols)),
            self.epsilon,
            self.tau,
            self.p,
            self.err_func.__class__.__name__,
            self.tablename,
            self.aggerr.keys,
            self.max_wait,
            self.c_range
        ]
        components = map(str, components)
        return hash('\n'.join(components))

    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)

        self.grouper = Grouper(full_table, self)
        self.SCORE_ID = add_meta_column(
            chain([full_table], bad_tables, good_tables),
            'SCOREVAR'
        )

    def set_params(self, **kwargs):
        self.cols = kwargs.get('cols', self.cols)
        self.params.update(kwargs)
        self.good_thresh = 0.0001
        self.granularity = kwargs.get('granularity', self.granularity)

    def make_rules(self, cur_groups):
        if cur_groups is None:
            new_groups = self.grouper.initial_groups()
        else:
            new_groups = self.grouper.merge_groups(cur_groups)

        rules = {}

        for attrs, groups in new_groups:
            start = time.time()
            for ro in self.grouper(attrs, groups):
                if self.max_wait:
                    self.n_rules_checked -= len(ro.rule.filter.conditions)
                    if self.n_rules_checked <= 0:
                        diff = time.time() - self.start
                        if not self.checkpoints or diff - self.checkpoints[-1][0] > 10:
                            if self.best:
                                best_rule = max(self.best).rule
                                self.checkpoints.append((diff, best_rule))
                        self.stop = diff > self.max_wait
                        self.n_rules_checked = 1000
                    if self.stop:
                        _logger.debug("wait %d > %d exceeded." % (diff, self.max_wait))
                        return

                yield attrs, ro

            # print "group by\t%s\t%.4f" % (str([attr.name for attr in attrs]), time.time()-start)

    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

        self.update_status("running bottom up algorithm")
        for pairs in self.find_cliques():
            rules = [(b.rule, iteridx) for b, iteridx in pairs]
            yield rules
        self.update_status("bottom up algorithm done")

    def find_cliques(self):
        """
        table has been trimmed of extraneous columns.
        """
        #clusters = self.load_from_cache()
        #if clusters is not None:
        #  yield clusters
        #  return

        rules = None
        self.best = []
        self.start = time.time()

        added = []
        nseen = 0
        niters = 0
        while (niters < self.max_complexity and
               not self.stop and
               (rules is None or rules)):
            niters += 1
            self.update_status("running bottomup iter %d" % niters)
            _logger.debug("=========iter %d=========", niters)

            nadded = 0
            seen = set()
            nnewgroups = 0
            new_rules = defaultdict(list)

            # for each combination of attributes
            # prune the groups that are less influential than the parent group's
            for attr, ro in self.make_rules(rules):
                nseen += 1
                if nseen % 50 == 0 and nseen > 0:
                    self.update_status("bottomup processed %d rules" % nseen)
                if self.stop:
                    break
                if self.top_k(ro):
                    nadded += 1

                if self.naive:
                    new_rules[attr] = [None]
                    nnewgroups += 1
                elif self.prune_rule(ro):
                    new_rules[attr].append(ro.group)
                    nnewgroups += 1
                ro.rule.__examples__ = None

                if nadded % 25 == 0 and nadded > 0:
                    newbests = filter(lambda c: c not in seen, self.best)
                    seen.update(self.best)
                    yield zip(newbests, [niters] * len(newbests))

            newbests = filter(lambda c: c not in seen, self.best)
            seen.update(self.best)
            yield zip(newbests, [niters] * len(newbests))

            if not nadded:
                break

            rules = new_rules

            if niters == 1:
                best = self.best
            else:
                best = set(self.best)
                if prev_best and prev_best in self.best:
                    self.best.remove(prev_best)
                best = list(best)

            self.best = [max(self.best)] if self.best else []
            prev_best = max(self.best) if self.best else None

        _logger.debug("finished, merging now")
        self.cost_clique = time.time() - self.start

        #self.cache_results(clusters)

    def prune_rule(self, ro):
        if ro.npts < self.min_pts:
            _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
            return False

        if (math.isnan(ro.bad_inf) or
                math.isnan(ro.good_inf) or
                math.isnan(ro.inf)):
            _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
            return False

        # assuming the best case (the good_stat was zero)
        # would the influence beat the best so far across
        # the full c_range?
        if self.best:
            if ro.dominated_by(max(self.best)):
                _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
                return False

        #if self.best and ro.best_inf <= max(self.best).inf:
        #  # if best tuple influence < rule influence:
        #  if ro.best_tuple_inf <= max(self.best).inf:
        #    _logger.debug("%s\t%s", 'FALSE', str(ro))
        #    return False

        # check max good influence
        if False and ro.good_inf < self.good_thresh:
            # TODO: can skip computing good_stats
            ro.good_skip = True

        #_logger.debug("%s\t%.4f\t%s", 'T', self.best and max(self.best).inf or 0, str(ro))
        return True

    def top_k(self, ro):
        n = 0
        best = self.best and max(self.best, key=lambda ro: ro.inf) or None
        if len(self.best) >= self.max_bests:
            bound = best.inf - self.best[0].inf
            thresh = self.best[0].inf + bound * 0.02
            if ro.inf <= thresh:
                return False

        if ro in self.best:
            return False
        if math.isnan(ro.inf):
            return False

        if len(self.best) < self.max_bests:
            n += 1
            _logger.debug(str(ro))
            heapq.heappush(self.best, ro)
        else:
            n += 1
            _logger.debug(str(ro))
            heapq.heapreplace(self.best, ro)

        best = best and max(best, ro) or ro
        return True

    @instrument
    def load_from_cache(self):
        import bsddb as bsddb3
        self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
        try:
            myhash = str(hash(self))
            if myhash in self.cache and self.use_cache:
                self.update_status("loading partitions from cache")
                dicts, errors = json.loads(self.cache[myhash])
                clusters = map(Cluster.from_dict, dicts)
                for c in clusters:
                    self.influence_cluster(c, self.full_table)
                return clusters
        except Exception as e:
            print e
            pdb.set_trace()
            pass
        finally:
            self.cache.close()
        return None

    @instrument
    def cache_results(self, clusters):
        import bsddb as bsddb3
        # save the clusters in a dictionary
        if self.use_cache:
            myhash = str(hash(self))
            self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
            try:
                dicts = [c.to_dict() for c in clusters]
                errors = [c.error for c in clusters]
                self.cache[myhash] = json.dumps((dicts, errors))
            except Exception as e:
                print e
                pdb.set_trace()
                pass
            finally:
                self.cache.close()
chunksQueue.join()
print 'All workers have finished.'
print 'Mapping and local grouping done. {} chunks grouped by {} threads.'.format(cf.nChunks, totalNumberOfGrouper)

################
# Global group #
################
print '------------------'
print('Global grouping...')
print '------------------'
listOfDirectory = []
globalGrouperDirectory = '/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/'
for i in range(0, totalNumberOfGrouper):
    listOfDirectory.append('/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/')
globalDict = Grouper.globalGrouper(saveStateNameGrouper, listGrouperNum, listLastCallNum, listOfDirectory, globalGrouperDirectory)
print('Global grouping done.')

############
# Reducing #
############
print '------------'
print('Reducing ...')
print '------------'
outputDict = dict()
for key, globalNodeFileName in globalDict.iteritems():
    reduceIterator = ReduceFromGroupIterator(globalNodeFileName)
    theReduceContext = ReduceContext(key, reduceIterator)
    outputDict[key] = Reducer.reduce(theReduceContext)
print('Reducing done.')
class System(Observable, Chroner, Wanderer):

    def __init__(self, **kwargs):
        si = self._sortedInputKeys['options']
        options = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedInputKeys['params']
        params = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedGhostKeys['configs']
        configs = Grouper(OrderedDict([(k, self.ghosts[k]) for k in si]))

        dOptions = options.copy()
        dOptions['hash'] = options.hashID
        dParams = params.copy()
        dParams['hash'] = params.hashID

        # self._systemObserverClasses = self.ghosts['observers']

        super().__init__(
            options=dOptions,
            params=dParams,
            supertype='System',
            **kwargs
        )

        self.params, self.options = params, options

    def system_construct(self):
        self.locals = Grouper(
            self._system_construct(self.options, self.params, self.configs))
        self.state.clear()
        for k in self.configs.keys():
            self.state[k] = SystemVar(self.locals[k], k)
        self.observables.clear()
        self.observables.update(self.locals)
        self.baselines.clear()
        self.baselines.update(
            {'mesh': fieldops.get_global_var_data(self.locals.mesh)})
        if hasattr(self.locals, 'obsVars'):
            self._fig = QuickFig(*self.locals.obsVars)
        else:
            self._fig = QuickFig(self.state[0])

    @property
    def constructed(self):
        return hasattr(self, 'locals')

    def _configurable_changed_state_hook(self):
        for k, v in self.configs.items():
            if v is Ellipsis:
                self.state[k].var.data[...] = self.state[k]._initialData

    def _voyager_changed_state_hook(self):
        super()._voyager_changed_state_hook()
        assert self.constructed
        for var in self.state:
            var.update()
        self.locals.solve()

    def _iterate(self):
        dt = self.locals.integrate()
        self.indices.chron.value += dt
        super()._iterate()

    def _out(self):
        outs = super()._out()
        add = self.evaluate()
        outs.update(add)
        return outs

    def _evaluate(self):
        if hasattr(self, 'locals'):
            add = {vn: mut.data for vn, mut in self.state.items()}
        else:
            add = {vn: OutsNull for vn in self.configs.keys()}
        return add

    def _save(self):
        super()._save()
        self.writer.add_dict(self.baselines, 'baselines')

    @_system_construct_if_necessary
    def _load_process(self, outs):
        outs = super()._load_process(outs)
        for key, mut in self.state.items():
            mut.mutate(outs.pop(key))
        return outs

    @property
    def fig(self):
        if self.indices.isnull or not hasattr(self, 'locals'):
            raise Exception("Nothing to show yet.")
        return self._fig

    def show(self):
        self.fig.show()

    def _observation_mode_hook(self):
        if self.indices.isnull:
            self.initialise()
        super()._observation_mode_hook()
def test_lookups(self):
    words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
    groups = Grouper(words, key=str.lower)
    self.assertEqual(groups['apple'], ["Apple", "apple"])
def test_containment(self):
    words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
    groups = Grouper(words, key=str.lower)
    self.assertIn('apple', groups)
down_wall = Wall()
current_cell.down_wall = down_wall
right_wall = Wall()
current_cell.right_wall = right_wall
if y != Y - 1:
    down_wall.neighbours = (current_cell, cells[(x, y + 1)])
    walls.append(down_wall)
if x != X - 1:
    right_wall.neighbours = (current_cell, cells[(x + 1, y)])
    walls.append(right_wall)

cell_list = [cells[key] for key in cells]
maze = Grouper(cell_list)

for _ in range(len(walls)):
    wall = popchoice(walls)
    cell_1, cell_2 = wall.neighbours
    if not maze.joined(cell_1, cell_2):
        wall.active = False
        maze.join(cell_1, cell_2)

maze_map = []
x_max = (X * 2) + 1
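# The loop above is a Kruskal-style maze carving pass: each wall is considered once,
# in random order, and knocked down only if it still separates two unconnected cells.
# `popchoice` is assumed to remove and return a random element from a list; it is not
# defined in this fragment. A hedged sketch of such a helper (hypothetical name taken
# from the snippet, not a standard-library function):
import random

def popchoice_sketch(items):
    """Remove and return a uniformly random element of items."""
    return items.pop(random.randrange(len(items)))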
def test_no_iterable_given(self):
    groups = Grouper(key=str.lower)
    self.assertEqual(dict(groups), {})
class MR(Basic):

    def __init__(self, *args, **kwargs):
        Basic.__init__(self, *args, **kwargs)
        self.best = []
        self.max_wait = kwargs.get('max_wait', 2 * 60 * 60)  # 2 hours
        self.start = None
        self.stop = False
        self.n_rules_checked = 0
        self.naive = kwargs.get('naive', False)
        self.max_bests = 50
        self.max_complexity = kwargs.get('max_complexity', 3)
        self.checkpoints = []
        self.cost_clique = 0

    def __hash__(self):
        components = [
            self.__class__.__name__,
            str(self.aggerr.__class__.__name__),
            str(set(self.cols)),
            self.epsilon,
            self.tau,
            self.p,
            self.err_func.__class__.__name__,
            self.tablename,
            self.aggerr.keys,
            self.max_wait,
            self.c_range
        ]
        components = map(str, components)
        return hash('\n'.join(components))

    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)

        self.grouper = Grouper(full_table, self)
        self.SCORE_ID = add_meta_column(
            chain([full_table], bad_tables, good_tables),
            'SCOREVAR'
        )

    def set_params(self, **kwargs):
        self.cols = kwargs.get('cols', self.cols)
        self.params.update(kwargs)
        self.max_bad_inf = -1e1000000
        self.good_thresh = 0.0001
        self.granularity = kwargs.get('granularity', self.granularity)

    def make_rules(self, cur_groups):
        if cur_groups is None:
            new_groups = self.grouper.initial_groups()
        else:
            new_groups = self.grouper.merge_groups(cur_groups)

        rules = {}

        for attrs, groups in new_groups:
            start = time.time()
            for ro in self.grouper(attrs, groups):
                if self.max_wait:
                    self.n_rules_checked -= len(ro.rule.filter.conditions)
                    if self.n_rules_checked <= 0:
                        diff = time.time() - self.start
                        if not self.checkpoints or diff - self.checkpoints[-1][0] > 10:
                            if self.best:
                                best_rule = max(self.best, key=lambda r: r.inf).rule
                                self.checkpoints.append((diff, best_rule))
                        self.stop = diff > self.max_wait
                        self.n_rules_checked = 1000
                    if self.stop:
                        _logger.debug("wait %d > %d exceeded." % (diff, self.max_wait))
                        return

                yield attrs, ro

            # print "group by\t%s\t%.4f" % (str([attr.name for attr in attrs]), time.time()-start)

    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

        clusters = self.find_cliques()
        clusters = self.merge_rules(clusters)
        clusters = filter(lambda c: r_vol(c.c_range), clusters)
        clusters.sort(reverse=True)
        self.all_clusters = self.final_clusters = clusters
        return clusters

        self.best.sort(reverse=True)
        return self.merge_rules(self.best)

    def find_cliques(self):
        """
        table has been trimmed of extraneous columns.
        """
        clusters = self.load_from_cache()
        if clusters is not None:
            return clusters

        rules = None
        self.opts_per_iter = []
        self.best = []
        self.start = time.time()

        nseen = 0
        niters = 0
        while niters < self.max_complexity and not self.stop and (rules is None or rules):
            niters += 1
            _logger.debug("=========iter %d=========", niters)
            besthash = hash(tuple(self.best))

            nadded = 0
            nnewgroups = 0
            new_rules = defaultdict(list)

            # for each combination of attributes
            # prune the groups that are less influential than the parent group's
            for attr, ro in self.make_rules(rules):
                nseen += 1
                if self.stop:
                    break

                nadded += self.top_k((ro,))

                if self.naive:
                    new_rules[attr] = [None]
                    nnewgroups += 1
                elif self.prune_rule(ro):
                    new_rules[attr].append(ro.group)
                    nnewgroups += 1
                ro.rule.__examples__ = None

                if nnewgroups % 10000 == 0:
                    pass
                    #print "# new groups\t", nnewgroups, '\t', time.time()-self.start, self.max_wait

            if not nadded:
                pass  # break

            rules = new_rules

            if niters == 1:
                self.opts_per_iter.append(list(self.best))
            else:
                self.opts_per_iter.append(list(self.best[1:]))
                if prev_best and prev_best in self.opts_per_iter[-1]:
                    self.opts_per_iter[-1].remove(prev_best)

            self.best = [max(self.best)] if self.best else []
            prev_best = max(self.best) if self.best else None

        _logger.debug("finished, merging now")
        self.cost_clique = time.time() - self.start

        ret = []
        for bests in self.opts_per_iter:
            bests.sort(reverse=True)
            ret.extend(bests)  # self.merge_rules(bests))

        clusters = map(self.blah_to_cluster, ret)
        self.cache_results(clusters)
        return clusters

    def blah_to_cluster(self, blah):
        rule = blah.rule
        fill_in_rules([rule], self.full_table, self.cols)
        c = Cluster.from_rule(rule, self.cols)
        c.error = self.influence_cluster(c, self.full_table)
        return c

    def merge_rules(self, clusters):
        start = time.time()
        clusters = filter_bad_clusters(clusters)

        thresh = compute_clusters_threshold(clusters, nstds=0.)
        is_mergable = lambda c: c.error >= thresh
        is_mergable = lambda c: True
        influence_f = lambda c: self.influence_cluster(c, self.full_table)

        params = dict(self.params)
        params.update({
            'learner_hash': hash(self),
            'cols': self.cols,
            'influence': influence_f,
            'is_mergable': is_mergable,
            'c_range': self.c_range,
            'use_mtuples': False,
            'learner': self,
            'partitions_complete': False
        })

        self.merger = RangeMerger(**params)
        #self.merger = Merger(**params)
        self.final_clusters = self.merger(clusters)
        self.all_clusters = clusters
        self.cost_merge = time.time() - start

        self.costs = {
            'cost_clique': self.cost_clique,
            'cost_merge': self.cost_merge
        }
        return self.final_clusters

    def prune_rules(self, rules):
        ret = defaultdict(set)
        for key, ros in rules.iteritems():
            for ro in ros:
                if self.prune_rule(ro):
                    ret[key].add(ro)
        return ret

    def prune_rule(self, ro):
        # update bad influence bounds
        self.max_bad_inf = max(self.max_bad_inf, ro.bad_inf)
        self.bad_thresh = max(self.bad_thresh, 0.01 * self.max_bad_inf)

        if ro.npts < self.min_pts:
            _logger.debug("%s\t%s", 'FALSE', str(ro))
            return False

        if (math.isnan(ro.bad_inf) or
                math.isnan(ro.good_inf) or
                math.isnan(ro.inf)):
            _logger.debug("%s\t%s", 'FALSE', str(ro))
            return False

        # check min bad influence
        #if ro.bad_inf < self.bad_thresh:
        #  return False

        # assuming the best case (the good_stat was zero)
        # would the influence beat the best so far across
        # the full c_range?
        if self.best:
            if ro.dominated_by(max(self.best)):
                return False

        #if self.best and ro.best_inf <= max(self.best).inf:
        #  # if best tuple influence < rule influence:
        #  if ro.best_tuple_inf <= max(self.best).inf:
        #    _logger.debug("%s\t%s", 'FALSE', str(ro))
        #    return False

        # check max good influence
        if ro.good_inf < self.good_thresh:
            # TODO: can skip computing good_stats
            ro.good_skip = True

        #_logger.debug("%s\t%.4f\t%s", 'T', self.best and max(self.best).inf or 0, str(ro))
        return True

    def top_k(self, rules):
        n = 0
        best = self.best and max(self.best, key=lambda ro: ro.inf) or None
        for ro in rules:
            if len(self.best) >= self.max_bests:
                bound = best.inf - self.best[0].inf
                thresh = self.best[0].inf + bound * 0.02
                if ro.inf <= thresh:
                    continue

            if ro in self.best:
                continue
            if math.isnan(ro.inf):
                continue

            if not best or ro.inf > best.inf:
                n += 1
                _logger.debug(str(ro))
                if len(self.best) < self.max_bests:
                    heapq.heappush(self.best, ro)
                else:
                    heapq.heapreplace(self.best, ro)

            best = best and max(best, ro) or ro
        return n

    @instrument
    def load_from_cache(self):
        import bsddb as bsddb3
        self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
        try:
            myhash = str(hash(self))
            if myhash in self.cache and self.use_cache:
                dicts, errors = json.loads(self.cache[myhash])
                clusters = map(Cluster.from_dict, dicts)
                return clusters
        except Exception as e:
            print e
            pdb.set_trace()
            pass
        finally:
            self.cache.close()
        return None

    @instrument
    def cache_results(self, clusters):
        import bsddb as bsddb3
        # save the clusters in a dictionary
        if self.use_cache:
            myhash = str(hash(self))
            self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
            try:
                dicts = [c.to_dict() for c in clusters]
                errors = [c.error for c in clusters]
                self.cache[myhash] = json.dumps((dicts, errors))
            except Exception as e:
                print e
                pdb.set_trace()
                pass
            finally:
                self.cache.close()