Example #1
def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """
    points are (x-index, y-index, cscore) per chromosome pair.
    """
    # This is the core single linkage algorithm.
    # It behaves roughly linearly in practice: we iterate through the sorted
    # pairs and, for each pair, only look back at nearby pairs, breaking out
    # of the inner loop once the x-distance exceeds max_dist.

    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i - 1, -1, -1):
            # x-axis distance
            del_x = points[i][0] - points[j][0]
            if del_x > max_dist: break
            # y-axis distance
            del_y = points[i][1] - points[j][1]
            if del_x + abs(del_y) > max_dist: continue
            #if abs(del_y) > Nmax: continue
            # otherwise join
            clusters.join(points[i], points[j])
    clusters = [
        cluster for cluster in list(clusters)
        if len(cluster) >= min_cluster_size
    ]
    return clusters
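Note that this listing mixes several unrelated classes that all happen to be named Grouper: a disjoint-set (union-find) grouper used by the clustering and synteny examples, a key-based grouping dict exercised by the test_* examples, and project-specific groupers (MapReduce, EPICS record grouping, dbwipes tables). None of them are shown here. Below is a minimal sketch of the union-find flavour that the clustering examples appear to rely on (join / joined / find / iteration over groups); it illustrates the assumed interface and is not the original implementation.

class Grouper(object):
    """Sketch of a disjoint-set grouper: items joined together end up in
    the same group; iterating yields each group (a list) exactly once."""

    def __init__(self, init=()):
        self._mapping = {}
        for item in init:
            self._mapping[item] = [item]

    def join(self, a, *args):
        # place a (and any further items) into the same group
        mapping = self._mapping
        group = mapping.setdefault(a, [a])
        for b in args:
            other = mapping.setdefault(b, [b])
            if other is not group:
                group.extend(other)
                for item in other:
                    mapping[item] = group

    def joined(self, a, b):
        # True if a and b have already been placed in the same group
        mapping = self._mapping
        return a in mapping and b in mapping and mapping[a] is mapping[b]

    def find(self, a):
        # the group (a list) containing a
        return self._mapping[a]

    __getitem__ = find

    def __iter__(self):
        # yield each distinct group exactly once
        seen = set()
        for group in self._mapping.values():
            if id(group) not in seen:
                seen.add(id(group))
                yield group

With a sketch like this, list(clusters) in single_linkage above yields one list per connected component, which is exactly what the min_cluster_size filter expects.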
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm with better running time than the naive O(n^2):
    assume each block has x_ends and y_ends for its bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last: active.clear()
        if left_right==0: 
            active.add(i) 
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else: # right end
            active.remove(i) 

        chr_last = chr

    return mergeables
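range_overlap() is not listed on this page. Assuming the y-ranges stored in eclusters have the same (seqid, start, end) shape as range_x above, a plausible sketch of the 1-D overlap test is:

def range_overlap(a, b):
    # a and b are assumed to be (seqid, start, end) tuples with inclusive ends
    a_chr, a_min, a_max = a
    b_chr, b_min, b_max = b
    if a_chr != b_chr:
        return False  # blocks on different chromosomes never overlap
    return a_min <= b_max and b_min <= a_max

With such a helper, get_2D_overlap returns a Grouper that links together (transitively) the indices of blocks that overlap in both x and y.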
def generate_sim_records(records, sim_record_name, dis_record_name):      
    sim_prefix = sim_record_name + ':'
    
    grouper = Grouper()
    groups = grouper.group_records(records)
    
    output = ""

    for g in groups.keys():
        sim_record_name = get_sim_name(groups[g].main)
        
        #Check sim record does not already exist - maybe someone started writing the records but got bored!
        if sim_record_name in records:
            continue
            
        #Skip record if it is a simulation record 
        if groups[g].main.startswith(sim_prefix):
            continue
            
        #Skip adding sim record if the original is a soft record
        if records[groups[g].main].dtyp is None or records[groups[g].main].dtyp.lower() == "soft channel":
            continue

        #No point simulating SIM or DISABLE
        if groups[g].RB != sim_record_name and groups[g].RB != dis_record_name:
            typ = records[groups[g].main].type
            #Don't add simulation record unless the type is suitable
            if typ in ALLOWED_SIM_TYPES:            
                print "ADDED SIM RECORD =", sim_record_name
                output += generate_record_text(records[groups[g].main], groups[g].RB, groups[g].SP, groups[g].SP_RBV)
                
    return output
Example #4
def get_2D_overlap(chain, eclusters):
    """
    Implements a sweep line algorithm with better running time than the naive O(n^2):
    assume each block has x_ends and y_ends for its bounds

    1. sort x_ends, and take a sweep line to scan the x_ends
    2. if left end, test y-axis intersection of current block with `active` set;
       also put this block in the `active` set
    3. if right end, remove block from the `active` set
    """
    mergeables = Grouper()
    active = set()

    x_ends = []
    for i, (range_x, range_y, score) in enumerate(eclusters):
        chr, left, right = range_x
        x_ends.append((chr, left, 0, i))  # 0/1 for left/right-ness
        x_ends.append((chr, right, 1, i))
    x_ends.sort()

    chr_last = ""
    for chr, pos, left_right, i in x_ends:
        if chr != chr_last: active.clear()
        if left_right == 0:
            active.add(i)
            for x in active:
                # check y-overlap
                if range_overlap(eclusters[x][1], eclusters[i][1]):
                    mergeables.join(x, i)
        else:  # right end
            active.remove(i)

        chr_last = chr

    return mergeables
Example #5
def group():
    """run the grouper"""
    logging.debug("Start grouping")
    groupi = Grouper()
    # pass True to use mapreduce and False to use combine
    groupi.run( True )
    logging.debug("Finished grouping")
Example #6
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    # get all synteny blocks for a query, algorithm is single linkage
    # anchors are a window centered on query
    # two categories of syntenic regions depending on what query is:
    # (Syntelog): syntenic region is denoted by the syntelog
    # (Gray gene): syntenic region is marked by the closest flanker

    regions = []
    ysorted = sorted(data, key=lambda x: x[1])
    g = Grouper()

    a, b = itertools.tee(ysorted)
    next(b, None)
    for ia, ib in itertools.izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid == sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker,
         syntelog), (far_flanker,
                     far_syntelog), flanked = get_flanker(group, query)

        # y-boundary of the block
        gs = [x[1] for x in group]
        left, right = min(gs), max(gs)

        # run a mini-dagchainer here, take the direction that gives us the most anchors
        orientation = "+"
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker == query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1  # slight penalty for not finding syntelog

        if score < cutoff: continue

        # this characterizes a syntenic region (left, right). syntelog is -1 if it's a gray gene
        syn_region = (syntelog, left, right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1])  # decreasing synteny score
def find_synteny_region(query, sbed, data, window, cutoff, colinear=True):
    # get all synteny blocks for a query, algorithm is single linkage
    # anchors are a window centered on query
    # two categories of syntenic regions depending on what query is:
    # (Syntelog): syntenic region is denoted by the syntelog
    # (Gray gene): syntenic region is marked by the closest flanker

    regions = []
    ysorted = sorted(data, key=lambda x:x[1])
    g = Grouper()

    a, b = itertools.tee(ysorted)
    next(b, None)
    for ia, ib in itertools.izip(a, b):
        pos1, pos2 = ia[1], ib[1]
        if pos2 - pos1 < window and sbed[pos1].seqid==sbed[pos2].seqid:
            g.join(ia, ib)

    for group in sorted(g):
        (qflanker, syntelog), (far_flanker, far_syntelog), flanked = get_flanker(group, query)

        # y-boundary of the block
        gs = [x[1] for x in group]
        left, right = min(gs), max(gs)

        # run a mini-dagchainer here, take the direction that gives us the most anchors
        orientation = "+"
        if colinear:
            y_indexed_group = [(y, i) for i, (x, y) in enumerate(group)]
            lis = longest_increasing_subsequence(y_indexed_group)
            lds = longest_decreasing_subsequence(y_indexed_group)

            if len(lis) >= len(lds):
                track = lis
            else:
                track = lds
                orientation = "-"

            group = [group[i] for (y, i) in track]

        xpos, ypos = zip(*group)
        score = min(len(set(xpos)), len(set(ypos)))

        if qflanker==query:
            gray = "S"
        else:
            gray = "G" if not flanked else "F"
            score -= 1 # slight penalty for not finding syntelog

        if score < cutoff: continue

        # this characterizes a syntenic region (left, right). syntelog is -1 if it's a gray gene
        syn_region = (syntelog, left, right, gray, orientation, score)
        regions.append(syn_region)

    return sorted(regions, key=lambda x: -x[-1]) # decreasing synteny score
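longest_increasing_subsequence and longest_decreasing_subsequence are not listed on this page. A simple O(n^2) dynamic-programming sketch that returns the actual subsequence of (y, i) tuples, which is all the mini-dagchainer step above needs, could look like this; real implementations typically use patience sorting for O(n log n).

def longest_increasing_subsequence(seq):
    # O(n^2) DP; returns one longest strictly increasing subsequence of seq
    if not seq:
        return []
    n = len(seq)
    length = [1] * n       # LIS length ending at position i
    parent = [-1] * n      # back-pointer for reconstructing the subsequence
    for i in range(1, n):
        for j in range(i):
            if seq[j] < seq[i] and length[j] + 1 > length[i]:
                length[i] = length[j] + 1
                parent[i] = j
    end = max(range(n), key=lambda k: length[k])
    out = []
    while end != -1:
        out.append(seq[end])
        end = parent[end]
    return out[::-1]


def longest_decreasing_subsequence(seq):
    # a decreasing subsequence of seq is an increasing subsequence of the
    # reversed sequence, read back in the original order
    return longest_increasing_subsequence(seq[::-1])[::-1]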
 def to_groups(self, distance):
     # not used.
     g = Grouper()
     # groupby's key takes a single (qa, sa) anchor pair
     for name, anchors in itertools.groupby(
             self, key=lambda pair: (pair[0].seqid, pair[1].seqid)):
         anchors = list(anchors)
         for ia, (qa, sa) in enumerate(anchors[:-1]):
             qb, sb = anchors[ia + 1]
             if qb.start - qa.end <= distance and sb.start - sa.end <= distance:
                 g.join((qa, sa), (qb, sb))
     return g
Example #9
 def test_adding_grouper_objects_together(self):
     words1 = ["apple", "animal", "lemon", "ANIMAL", "Apple"]
     words2 = ["Lemon", "Animal", "Apple", "lemon"]
     word_groups = {
         "apple": ["apple", "Apple", "Apple"],
         "animal": ["animal", "ANIMAL", "Animal"],
         "lemon": ["lemon", "Lemon", "lemon"],
     }
     groups1 = Grouper(words1, key=str.lower)
     groups2 = Grouper(words2, key=str.lower)
     self.assertEqual(dict(groups1 + groups2), word_groups)
     groups3 = Grouper(words2, key=str.upper)
     with self.assertRaises(ValueError):
         groups1 + groups3  # Can't concatenate groups with different keys
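The test_* examples on this page exercise a different, dict-backed Grouper that files items under a key function. A minimal sketch consistent with the assertions shown, offered only as an illustration of the assumed behaviour rather than the tested implementation:

class Grouper(dict):
    """Sketch of a key-based grouping dict: group name -> list of items."""

    def __init__(self, iterable=(), key=lambda x: x):
        dict.__init__(self)
        self.key = key
        self.update(iterable)

    def add(self, item):
        # file item under the group named by key(item)
        self.setdefault(self.key(item), []).append(item)

    def update(self, iterable_or_mapping):
        # accept either a mapping of group -> items or a plain iterable of items
        if hasattr(iterable_or_mapping, 'items'):
            for group, items in iterable_or_mapping.items():
                self.setdefault(group, []).extend(items)
        else:
            for item in iterable_or_mapping:
                self.add(item)

    def group_for(self, item):
        # the group name an item would be filed under (without inserting it)
        return self.key(item)

    def __add__(self, other):
        if self.key != other.key:
            raise ValueError("can only concatenate Groupers with the same key")
        combined = Grouper(self, key=self.key)
        combined.update(other)
        return combined

This class is unrelated to the union-find Grouper sketched near the top of the page; the two merely share a name.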
Example #10
 def system_construct(self):
     self.locals = Grouper(
         self._system_construct(self.options, self.params, self.configs))
     self.state.clear()
     for k in self.configs.keys():
         self.state[k] = SystemVar(self.locals[k], k)
     self.observables.clear()
     self.observables.update(self.locals)
     self.baselines.clear()
     self.baselines.update(
         {'mesh': fieldops.get_global_var_data(self.locals.mesh)})
     if hasattr(self.locals, 'obsVars'):
         self._fig = QuickFig(*self.locals.obsVars)
     else:
         self._fig = QuickFig(self.state[0])
Example #11
 def test_init_accepts_mapping(self):
     dictionary = {
         "apple": ["Apple", "apple"],
         "lemon": ["lemon"],
     }
     groups = Grouper(dictionary, key=str.lower)
     self.assertEqual(dict(groups), dictionary)
Example #12
    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)
        self.grouper = Grouper(full_table, self) 

        self.SCORE_ID = add_meta_column(
                chain([full_table], bad_tables, good_tables),
                'SCOREVAR' 
        )
Example #13
def job(grouperNum, chunksQueue, listSaveStateNameGrouper, listListLastCallNum):
    print 'Starting worker ' + str(grouperNum)      
    while True:
        # Get a new chunk to process
        chunk = chunksQueue.get()
        # Work
        print 'Worker ' + str(grouperNum) + ' mapping chunk ' + str(chunk)
        MapIterator = MapChunkIterator(mapChunksNameGenerator(chunk))  # iterator over the chunk
        theContext = MapContext(groupChunksNameGenerator(chunk), MapIterator)
        Mapper.map(theContext)
        print 'Worker ' + str(grouperNum) + ' grouping locally chunk ' + str(chunk)
        idx = listListLastCallNum[grouperNum] + 1
        theGrouper = Grouper(grouperNum, idx, idx - 1, directory)
        listSaveStateNameGrouper[grouperNum] = theGrouper.group(theContext)
        listListLastCallNum[grouperNum] = idx
        # "Close" chunk
        chunksQueue.task_done()
Example #14
 def test_strings(self):
     words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
     word_groups = {
         "apple": ["Apple", "apple"],
         "animal": ["animal", "ANIMAL", "animal"],
     }
     groups = Grouper(words, key=str.lower)
     self.assertEqual(dict(groups), word_groups)
Example #15
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10] 
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10] 

    simple_blast.sort()

    standems = Grouper()
    for name, hits in itertools.groupby(simple_blast, key=lambda x:x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chromosome and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]: 
                standems.join(a[1], b[1])

    return standems
Example #16
def tandem_grouper(bed, blast_list, tandem_Nmax=10, flip=True):
    if not flip:
        simple_blast = [(b.query, (b.sseqid, b.si)) for b in blast_list if b.evalue < 1e-10] 
    else:
        simple_blast = [(b.subject, (b.qseqid, b.qi)) for b in blast_list if b.evalue < 1e-10] 

    simple_blast.sort()

    standems = Grouper()
    for name, hits in itertools.groupby(simple_blast, key=lambda x:x[0]):
        # these are already sorted.
        hits = [x[1] for x in hits]
        for ia, a in enumerate(hits[:-1]):
            b = hits[ia + 1]
            # on the same chromosome and rank difference no larger than tandem_Nmax
            if b[1] - a[1] <= tandem_Nmax and b[0] == a[0]: 
                standems.join(a[1], b[1])

    return standems
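As a toy illustration of the rank-difference rule above, using the union-find sketch from earlier in this listing and made-up (chromosome, rank) pairs: subject ranks 10 and 12 on the same chromosome get joined as tandem duplicates, while rank 40 stays on its own.

toy = Grouper()
hits = [("chr1", 10), ("chr1", 12), ("chr1", 40)]   # (chromosome, rank), already sorted
for ia, a in enumerate(hits[:-1]):
    b = hits[ia + 1]
    if b[1] - a[1] <= 10 and b[0] == a[0]:          # tandem_Nmax = 10
        toy.join(a[1], b[1])
# list(toy) -> [[10, 12]]; rank 40 was never joined, so it forms no group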
def merge_clusters(chain, clusters):

    # there are, in general, two kinds of breakpoints
    # those that are induced by inversions, and those by translocations
    # inversion-breakpoints are excessive breakpoints that I want to remove
    
    chain_num = len(chain)
    mergeables = Grouper() # disjoint sets of clusters that can be merged
    for j in xrange(chain_num):
        cj = chain[j]
        mergeables.join(cj, cj)
        for i in xrange(j-1, -1, -1):
            ci = chain[i]
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax: continue 

            del_y = distance_y(clusters[ci], clusters[cj])
            if del_x + del_y > Nmax: continue
            mergeables.join(ci, cj)

    to_merge = {} 
    for mergeable in mergeables:
        for m in mergeable:
            to_merge[m] = min(mergeables[m])

    merged_chain = []
    for c in chain:
        if to_merge[c]==c: # i.e. parent of mergeables
            merged_chain.append(c)

    # refresh clusters list, merge chains
    for k, v in to_merge.iteritems():
        if to_merge[k]!=k: # i.e. not map to self
            clusters[v].extend(clusters[k])

    # maintain the x-sort
    [cluster.sort() for cluster in clusters]

    # updated is False when nothing was merged (chain length unchanged)
    updated = (len(merged_chain) != chain_num)
    return merged_chain, updated
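distance_x and distance_y are not listed on this page. Assuming each cluster is an x-sorted list of anchor points whose first two fields are the x and y indices, one plausible sketch measures the gap between the two clusters' coordinate ranges (zero when the ranges overlap):

def _gap(a_min, a_max, b_min, b_max):
    # gap between two 1-D intervals; 0 if they touch or overlap
    return max(0, max(a_min, b_min) - min(a_max, b_max))


def distance_x(cluster_a, cluster_b):
    ax = [p[0] for p in cluster_a]
    bx = [p[0] for p in cluster_b]
    return _gap(min(ax), max(ax), min(bx), max(bx))


def distance_y(cluster_a, cluster_b):
    ay = [p[1] for p in cluster_a]
    by = [p[1] for p in cluster_b]
    return _gap(min(ay), max(ay), min(by), max(by))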
def load_geneorders(fp_gff):

    # load gene orders before any filtering

    fp_gff.seek(0)
    tandem = Grouper()
    print >>sys.stderr, "Read .genes file"
    # chromosome => gene_list in that chromosome
    chr_ranks = collections.defaultdict(list)
    ranks = {}  # gene => rank position
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        start = int(start)
        chr_ranks[chr].append((start, gene, chr))
        tandem.join(gene)
    for v in chr_ranks.itervalues():
        gene_rank = 0
        for start, gene, chr in sorted(v):
            ranks[gene] = (chr, gene_rank)
            gene_rank += 1
    return ranks, tandem
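For reference, a tiny usage sketch with made-up rows: the .genes file is whitespace-delimited "chromosome gene start stop", and ranks maps each gene to (chromosome, rank within that chromosome) after sorting by start position.

from StringIO import StringIO   # Python 2, matching the code above

demo = StringIO("chr1 geneB 200 300\n"
                "chr1 geneA 100 150\n"
                "chr2 geneC 50 90\n")
ranks, tandem = load_geneorders(demo)
# ranks == {'geneA': ('chr1', 0), 'geneB': ('chr1', 1), 'geneC': ('chr2', 0)}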
Example #19
def load_geneorders(fp_gff):

    # load gene orders before any filtering

    fp_gff.seek(0)
    tandem = Grouper()
    print >> sys.stderr, "Read .genes file"
    # chromosome => gene_list in that chromosome
    chr_ranks = collections.defaultdict(list)
    ranks = {}  # gene => rank position
    for row in fp_gff:
        chr, gene, start, stop = row.split()
        start = int(start)
        chr_ranks[chr].append((start, gene, chr))
        tandem.join(gene)
    for v in chr_ranks.itervalues():
        gene_rank = 0
        for start, gene, chr in sorted(v):
            ranks[gene] = (chr, gene_rank)
            gene_rank += 1
    return ranks, tandem
def single_linkage(points, xdist, ydist, N):

    # This is the core single linkage algorithm.
    # It behaves roughly linearly in practice: we iterate through the sorted
    # pairs and, for each pair, only look back at nearby pairs, breaking out
    # of the inner loop once the x-distance exceeds xdist.

    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i-1, -1, -1):
            # x-axis distance
            del_x = points[i][0]-points[j][0]
            if del_x > xdist: break
            # y-axis distance
            del_y = points[i][1]-points[j][1]
            if abs(del_y) > ydist: continue
            # otherwise join
            clusters.join(points[i], points[j])
    clusters = [cluster for cluster in list(clusters) if score(cluster)>=N]
    return clusters
Example #21
def mergeable(group1, group2, all_ranks, quota):

    # rule no.1- respect quota
    # rule no.2- but only count close genes once
    micro_grouper = Grouper() # data structure to check rule no.2
    merged_group = group1 + group2 # attempted merge
    nmerged = len(merged_group)

    # do all pairwise comparisons to find closely located genes
    # TODO: silly implementation, not efficient
    for i, genei in enumerate(merged_group):
        speciesi, chri, posi = all_ranks[genei]
        micro_grouper.join(genei)
        for j in xrange(i+1, nmerged):
            genej = merged_group[j]
            speciesj, chrj, posj = all_ranks[genej]
            if speciesi==speciesj and chri==chrj and abs(posi-posj)<=Tandem_Nmax/2:
                micro_grouper.join(genei, genej)

    species_count = collections.defaultdict(int) # data structure to check rule no.1
    for gene_group in micro_grouper:
        species = all_ranks[gene_group[0]][0]
        species_count[species] += 1

    for species, count in species_count.items():
        if count>quota[species]: 
            return False

    return True
 def check(self):
     print "\n** CHECKING", self.file, "**"
     records = parse_db(self.file)
     grouper = Grouper()
     
     #Check for consistency in whether PV macros are followed by colons
     colon = None
     for r in records.keys():
         if colon is None:
             n = self.remove_macro(r, False)
             colon = n.startswith(':')
         else:
             n = self.remove_macro(r, False)
             if n.startswith(':') != colon:
                 if colon:
                     self.errors.append("FORMAT ERROR: " + r + " should have a colon after the macro")
                 else:
                     self.errors.append("FORMAT ERROR: " + r + " should not have a colon after the macro")
     
     groups = grouper.group_records(records)
     
     if self.debug:
         for s in groups.keys():
             print s, groups[s].RB, groups[s].SP, groups[s].SP_RBV
     
     for s in groups.keys():
         self.check_case(groups[s])
         self.check_chars(groups[s])
         self.check_candidates(groups[s], records)
             
     for w in self.warnings:
         print w
                 
     for e in self.errors:
         print e
                             
     print "** WARNING COUNT =", len(self.warnings), "**"
     print "** ERROR COUNT =", len(self.errors), "**"     
Example #23
 def test_add_and_group_for_methods(self):
     names = ["Trey Hunner", "Monica Marshall", "Katherine Hunner"]
     def last_name(name): return name.rsplit()[-1]
     name_groups = Grouper(names, key=last_name)
     self.assertEqual(name_groups.group_for("Rose Hunner"), "Hunner")
     self.assertEqual(name_groups.group_for("Rose Klyce"), "Klyce")
     self.assertEqual(
         name_groups['Hunner'],
         ["Trey Hunner", "Katherine Hunner"],
     )
     name_groups.add('Rose Hunner')
     self.assertEqual(
         name_groups['Hunner'],
         ["Trey Hunner", "Katherine Hunner", "Rose Hunner"],
     )
     name_groups.add("Rose Klyce")
     self.assertEqual(name_groups['Klyce'], ["Rose Klyce"])
def single_linkage(points, max_dist=Nmax, min_cluster_size=N):
    """
    points are (x-index, y-index, cscore) per chromosome pair.
    """
    # This is the core single linkage algorithm.
    # It behaves roughly linearly in practice: we iterate through the sorted
    # pairs and, for each pair, only look back at nearby pairs, breaking out
    # of the inner loop once the x-distance exceeds max_dist.

    clusters = Grouper()
    n = len(points)
    points.sort()
    for i in xrange(n):
        for j in xrange(i-1, -1, -1):
            # x-axis distance
            del_x = points[i][0]-points[j][0]
            if del_x > max_dist: break
            # y-axis distance
            del_y = points[i][1]-points[j][1]
            if del_x + abs(del_y) > max_dist: continue
            #if abs(del_y) > Nmax: continue
            # otherwise join
            clusters.join(points[i], points[j])
    clusters = [cluster for cluster in list(clusters) if len(cluster)>=min_cluster_size]
    return clusters
Example #25
 def test_test_tuples_of_strings(self):
     animals = [
         ('agatha', 'dog'),
         ('kurt', 'cat'),
         ('margaret', 'mouse'),
         ('cory', 'cat'),
         ('mary', 'mouse'),
     ]
     animals_by_type = {
         'mouse': [('margaret', 'mouse'), ('mary', 'mouse')],
         'dog': [('agatha', 'dog')],
         'cat': [('kurt', 'cat'), ('cory', 'cat')],
     }
     groups = Grouper(animals, key=itemgetter(1))
     self.assertEqual(dict(groups), animals_by_type)
Example #26
 def __init__(self):
     # Current call stack
     self.call_stack = ['__main__']
     # A mapping of which function called which other function
     self.call_dict = defaultdict(lambda: defaultdict(int))
     # Counters for each function
     self.func_count = defaultdict(int)
     self.func_count_max = 0
     self.call_stack_timer = []
     self.previous_event_return = False
     # Accumulative time per function
     self.func_time = defaultdict(float)
     self.func_time_max = 0
     self.trace_grouper = Grouper()
     self.painter = GraphvizOutput()
     self.painter.processor = self
Example #27
 def construct(self):
     constructed = self._construct(p=self.inputs)
     localObj = Grouper(constructed)
     del localObj['p']
     try:
         del localObj['self']
     except KeyError:
         pass
     self._construct_check(localObj)
     self._locals = localObj
     self._stateVars = list()
     for k in self.configs.keys():
         var = self._locals[k]
         if isinstance(var, np.ndarray):
             var = SwarmVar(var, k, self.inputs)
         else:
             var = GlobeVar(var, k, self.inputs)
         self._stateVars.append(var)
Example #28
 def test_custom_update_method(self):
     words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
     word_groups = {
         "apple": ["Apple", "apple", "APPLE", "APPLE"],
         "animal": ["animal", "ANIMAL", "animal"],
         "lemon": ["lemon", "Lemon", "lemon", "LEMON"],
         "orange": ["Orange"],
     }
     more_items = {
         "apple": ["APPLE"],
         "lemon": ["lemon", "LEMON"],
         "orange": ["Orange"],
     }
     groups = Grouper(words, key=str.lower)
     groups.update(["lemon", "Lemon", "APPLE"])
     groups.update(more_items)
     self.assertEqual(dict(groups), word_groups)
def mergedSpeakers(chat):
    gps = Grouper()

    for comment in chat:
        if comment.thread > 0:
            spkr = comment.name
            gps.join(comment.name, comment.name)
            for ment in comment.mentioned:
                gps.join(comment.name, ment)

    gpToNum = {}
    for ctr, gp in enumerate(gps):
        gpToNum[tuple(gp)] = ctr + 1

    for comment in chat:
        if comment.thread > 0:
            gp = gps.find(comment.name)
            num = gpToNum[tuple(gp)]
            comment.thread = num

    return chat
Example #30
    def __init__(self, **kwargs):

        si = self._sortedInputKeys['options']
        options = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedInputKeys['params']
        params = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedGhostKeys['configs']
        configs = Grouper(OrderedDict([(k, self.ghosts[k]) for k in si]))

        dOptions = options.copy()
        dOptions['hash'] = options.hashID
        dParams = params.copy()
        dParams['hash'] = params.hashID

        # self._systemObserverClasses = self.ghosts['observers']

        super().__init__(options=dOptions,
                         params=dParams,
                         supertype='System',
                         **kwargs)

        self.params, self.options = params, options
Example #31
def make_family(gene_pairs, all_ranks, quota):
    
    print >>sys.stderr, "... gene family clustering started"

    g = Grouper() 

    gene_pairs.sort(reverse=True)
    #pprint.pprint(gene_pairs[:10])
    for synteny_score, gene1, gene2 in gene_pairs:
        # attempt to join the two genes

        g.join(gene1)
        g.join(gene2)
        group1, group2 = g[gene1], g[gene2]

        if mergeable(group1, group2, all_ranks, quota):
            g.join(gene1, gene2)

    return g
Example #32
def merge_clusters(chain, clusters):

    # there are, in general, two kinds of breakpoints
    # those that are induced by inversions, and those by translocations
    # inversion-breakpoints are excessive breakpoints that I want to remove

    chain_num = len(chain)
    mergeables = Grouper()  # disjoint sets of clusters that can be merged
    for j in xrange(chain_num):
        cj = chain[j]
        mergeables.join(cj, cj)
        for i in xrange(j - 1, -1, -1):
            ci = chain[i]
            del_x = distance_x(clusters[ci], clusters[cj])
            if del_x > Nmax: continue

            del_y = distance_y(clusters[ci], clusters[cj])
            if del_x + del_y > Nmax: continue
            mergeables.join(ci, cj)

    to_merge = {}
    for mergeable in mergeables:
        for m in mergeable:
            to_merge[m] = min(mergeables[m])

    merged_chain = []
    for c in chain:
        if to_merge[c] == c:  # i.e. parent of mergeables
            merged_chain.append(c)

    # refresh clusters list, merge chains
    for k, v in to_merge.iteritems():
        if to_merge[k] != k:  # i.e. not map to self
            clusters[v].extend(clusters[k])

    # maintain the x-sort
    [cluster.sort() for cluster in clusters]

    # updated is False when nothing was merged (chain length unchanged)
    updated = (len(merged_chain) != chain_num)
    return merged_chain, updated
Example #33
class MR(Basic):
    def __init__(self, *args, **kwargs):
        Basic.__init__(self, *args, **kwargs)
        self.best = []
        self.max_wait = kwargs.get('max_wait', 2 * 60 * 60)  # 2 hours
        self.start = None
        self.stop = False
        self.n_rules_checked = 0
        self.naive = kwargs.get('naive', False)
        self.max_bests = 50
        self.max_complexity = kwargs.get('max_complexity', 3)

        self.checkpoints = []

        self.cost_clique = 0

    def __hash__(self):
        components = [
            self.__class__.__name__,
            str(self.aggerr.__class__.__name__),
            str(set(self.cols)), self.epsilon, self.tau, self.p,
            self.err_func.__class__.__name__, self.tablename, self.aggerr.keys,
            self.max_wait, self.c_range
        ]
        components = map(str, components)
        return hash('\n'.join(components))

    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)
        self.grouper = Grouper(full_table, self)

        self.SCORE_ID = add_meta_column(
            chain([full_table], bad_tables, good_tables), 'SCOREVAR')

    def set_params(self, **kwargs):
        self.cols = kwargs.get('cols', self.cols)
        self.params.update(kwargs)
        self.good_thresh = 0.0001
        self.granularity = kwargs.get('granularity', self.granularity)

    def make_rules(self, cur_groups):
        if cur_groups == None:
            new_groups = self.grouper.initial_groups()
        else:
            new_groups = self.grouper.merge_groups(cur_groups)

        rules = {}

        for attrs, groups in new_groups:
            start = time.time()
            for ro in self.grouper(attrs, groups):
                if self.max_wait:
                    self.n_rules_checked -= len(ro.rule.filter.conditions)
                    if self.n_rules_checked <= 0:
                        diff = time.time() - self.start
                        if not self.checkpoints or diff - self.checkpoints[-1][
                                0] > 10:
                            if self.best:
                                best_rule = max(self.best).rule
                                self.checkpoints.append((diff, best_rule))
                        self.stop = diff > self.max_wait
                        self.n_rules_checked = 1000
                    if self.stop:
                        _logger.debug("wait %d > %d exceeded." %
                                      (diff, self.max_wait))
                        return

                yield attrs, ro


#        print "group by\t%s\t%.4f" % (str([attr.name for attr in attrs]), time.time()-start)

    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
        self.setup_tables(full_table, bad_tables, good_tables, **kwargs)
        self.update_status("running bottom up algorithm")
        for pairs in self.find_cliques():
            rules = [(b.rule, iteridx) for b, iteridx in pairs]
            yield rules
        self.update_status("bottom up algorithm done")

    def find_cliques(self):
        """
    table has been trimmed of extraneous columns.
    """
        #clusters = self.load_from_cache()
        #if clusters is not None:
        #yield clusters
        #return

        rules = None
        self.best = []
        self.start = time.time()

        added = []
        nseen = 0
        niters = 0
        while (niters < self.max_complexity and not self.stop
               and (rules is None or rules)):
            niters += 1
            self.update_status("running bottomup iter %d" % niters)
            _logger.debug("=========iter %d=========", niters)

            nadded = 0
            seen = set()
            nnewgroups = 0
            new_rules = defaultdict(list)

            # for each combination of attributes
            # prune the groups that are less influential than the parent group's
            #

            for attr, ro in self.make_rules(rules):
                nseen += 1
                if nseen % 50 == 0 and nseen > 0:
                    self.update_status("bottomup processed %d rules" % nseen)

                if self.stop:
                    break

                if self.top_k(ro):
                    nadded += 1

                if self.naive:
                    new_rules[attr] = [None]
                    nnewgroups += 1
                elif self.prune_rule(ro):
                    new_rules[attr].append(ro.group)
                    nnewgroups += 1
                ro.rule.__examples__ = None

                if nadded % 25 == 0 and nadded > 0:
                    newbests = filter(lambda c: c not in seen, self.best)
                    seen.update(self.best)
                    yield zip(newbests, [niters] * len(newbests))

            newbests = filter(lambda c: c not in seen, self.best)
            seen.update(self.best)
            yield zip(newbests, [niters] * len(newbests))
            if not nadded:
                break

            rules = new_rules
            if niters == 1:
                best = self.best
            else:
                best = set(self.best)
                if prev_best and prev_best in self.best:
                    self.best.remove(prev_best)
                best = list(best)

            self.best = [max(self.best)] if self.best else []
            prev_best = max(self.best) if self.best else None

        _logger.debug("finished, merging now")
        self.cost_clique = time.time() - self.start

        #self.cache_results(clusters)

    def prune_rule(self, ro):
        if ro.npts < self.min_pts:
            _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
            return False

        if (math.isnan(ro.bad_inf) or math.isnan(ro.good_inf)
                or math.isnan(ro.inf)):
            _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
            return False

        # assuming the best case (the good_stat was zero)
        # would the influence beat the best so far across
        # the full c_range?
        if self.best:
            if ro.dominated_by(max(self.best)):
                _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
                return False

        #if self.best and ro.best_inf <= max(self.best).inf:
        #    # if best tuple influence < rule influence:
        #    if ro.best_tuple_inf <= max(self.best).inf:
        #        _logger.debug("%s\t%s", 'FALSE', str(ro))
        #        return False

        # check max good influence
        if False and ro.good_inf < self.good_thresh:
            # TODO: can skip computing good_stats
            ro.good_skip = True

        #_logger.debug("%s\t%.4f\t%s", 'T', self.best and max(self.best).inf or 0, str(ro))
        return True

    def top_k(self, ro):
        n = 0
        best = self.best and max(self.best, key=lambda ro: ro.inf) or None
        if len(self.best) >= self.max_bests:
            bound = best.inf - self.best[0].inf
            thresh = self.best[0].inf + bound * 0.02
            if ro.inf <= thresh:
                return False

        if ro in self.best:
            return False
        if math.isnan(ro.inf):
            return False

        if len(self.best) < self.max_bests:
            n += 1
            _logger.debug(str(ro))
            heapq.heappush(self.best, ro)
        else:
            n += 1
            _logger.debug(str(ro))
            heapq.heapreplace(self.best, ro)

        best = best and max(best, ro) or ro

        return True

    @instrument
    def load_from_cache(self):
        import bsddb as bsddb3
        self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
        try:
            myhash = str(hash(self))
            if myhash in self.cache and self.use_cache:
                self.update_status("loading partitions from cache")
                dicts, errors = json.loads(self.cache[myhash])
                clusters = map(Cluster.from_dict, dicts)
                for c in clusters:
                    self.influence_cluster(c, self.full_table)
                return clusters
        except Exception as e:
            print e
            pdb.set_trace()
            pass
        finally:
            self.cache.close()
        return None

    @instrument
    def cache_results(self, clusters):
        import bsddb as bsddb3
        # save the clusters in a dictionary
        if self.use_cache:
            myhash = str(hash(self))
            self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
            try:
                dicts = [c.to_dict() for c in clusters]
                errors = [c.error for c in clusters]
                self.cache[myhash] = json.dumps((dicts, errors))
            except Exception as e:
                print e
                pdb.set_trace()
                pass
            finally:
                self.cache.close()
Example #34
chunksQueue.join()
print 'All workers have finished.'

print 'Mapping and local grouping done. {} chunks grouped by {} threads.'.format(cf.nChunks, totalNumberOfGrouper)

################
# Global group #
################
print '------------------'
print('Global grouping...')
print '------------------'
listOfDirectory = []
globalGrouperDirectory = '/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/'
for i in range(0,totalNumberOfGrouper):
    listOfDirectory.append('/Users/lcambier/TempMapReduce/mapper_and_groupper_logs2/')
globalDict = Grouper.globalGrouper(saveStateNameGrouper,listGrouperNum,listLastCallNum,listOfDirectory,globalGrouperDirectory)
print('Global grouping done.')

############
# Reducing #
############
print '------------'
print('Reducing ...')
print '------------'
outputDict = dict()
for key, globalNodeFileName in globalDict.iteritems():
    reduceIterator = ReduceFromGroupIterator(globalNodeFileName)
    theReduceContext = ReduceContext(key,reduceIterator)
    outputDict[key] = Reducer.reduce(theReduceContext)
print('Reducing done.')
Example #35
class MR(Basic):

  def __init__(self, *args, **kwargs):
    Basic.__init__(self, *args, **kwargs)
    self.best = []
    self.max_wait = kwargs.get('max_wait', 2 * 60 * 60) # 2 hours
    self.start = None
    self.stop = False
    self.n_rules_checked = 0
    self.naive = kwargs.get('naive', False)
    self.max_bests = 50
    self.max_complexity = kwargs.get('max_complexity', 3)

    self.checkpoints = []

    self.cost_clique = 0


  def __hash__(self):
    components = [
      self.__class__.__name__,
      str(self.aggerr.__class__.__name__),
      str(set(self.cols)),
      self.epsilon,
      self.tau,
      self.p,
      self.err_func.__class__.__name__,
      self.tablename,
      self.aggerr.keys,
      self.max_wait,
      self.c_range
    ]
    components = map(str, components)
    return hash('\n'.join(components))


  def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
    Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)
    self.grouper = Grouper(full_table, self) 

    self.SCORE_ID = add_meta_column(
            chain([full_table], bad_tables, good_tables),
            'SCOREVAR' 
    )



  def set_params(self, **kwargs):
    self.cols = kwargs.get('cols', self.cols)
    self.params.update(kwargs)
    self.good_thresh = 0.0001
    self.granularity = kwargs.get('granularity', self.granularity)

  def make_rules(self, cur_groups):
    if cur_groups == None:
      new_groups = self.grouper.initial_groups()
    else:
      new_groups = self.grouper.merge_groups(cur_groups)

    rules = {}

    for attrs, groups in new_groups:
      start = time.time()
      for ro in self.grouper(attrs, groups):
        if self.max_wait:
          self.n_rules_checked -= len(ro.rule.filter.conditions)
          if self.n_rules_checked <= 0:
            diff = time.time() - self.start
            if not self.checkpoints or diff - self.checkpoints[-1][0] > 10:
              if self.best:
                best_rule = max(self.best).rule
                self.checkpoints.append((diff, best_rule))
            self.stop = diff > self.max_wait
            self.n_rules_checked = 1000
          if self.stop:
            _logger.debug("wait %d > %d exceeded." % (diff, self.max_wait))
            return


        yield attrs, ro
#        print "group by\t%s\t%.4f" % (str([attr.name for attr in attrs]), time.time()-start)



  def __call__(self, full_table, bad_tables, good_tables, **kwargs):
    self.setup_tables(full_table, bad_tables, good_tables, **kwargs)
    self.update_status("running bottom up algorithm")
    for pairs in self.find_cliques():
      rules = [(b.rule, iteridx) for b, iteridx in pairs]
      yield rules
    self.update_status("bottom up algorithm done")


  def find_cliques(self):
    """
    table has been trimmed of extraneous columns.
    """
    #clusters = self.load_from_cache()
    #if clusters is not None:
      #yield clusters
      #return 

    rules = None
    self.best = []
    self.start = time.time()

    added = []
    nseen = 0
    niters = 0 
    while (niters < self.max_complexity and 
           not self.stop and 
           (rules is None or rules)):
      niters += 1
      self.update_status("running bottomup iter %d" % niters)
      _logger.debug("=========iter %d=========", niters)

      nadded = 0
      seen = set()
      nnewgroups = 0
      new_rules = defaultdict(list)
      
      # for each combination of attributes
      # prune the groups that are less influential than the parent group's 
      #  

      for attr, ro in self.make_rules(rules):
        nseen += 1
        if nseen % 50 == 0 and nseen > 0:
          self.update_status("bottomup processed %d rules" % nseen)

        if self.stop:
            break

        if self.top_k(ro):
          nadded += 1

        if self.naive:
            new_rules[attr] = [None]
            nnewgroups += 1
        elif self.prune_rule(ro):
            new_rules[attr].append(ro.group)
            nnewgroups += 1
        ro.rule.__examples__ = None


        if nadded % 25 == 0 and nadded > 0:
          newbests = filter(lambda c: c not in seen, self.best)
          seen.update(self.best)
          yield zip(newbests, [niters]*len(newbests))


      newbests = filter(lambda c: c not in seen, self.best)
      seen.update(self.best)
      yield zip(newbests, [niters]*len(newbests))
      if not nadded: 
        break 

      rules = new_rules
      if niters == 1:
        best = self.best
      else:
        best = set(self.best)
        if prev_best and prev_best in self.best:
          self.best.remove(prev_best)
        best = list(best)

      self.best = [max(self.best)] if self.best else []
      prev_best = max(self.best) if self.best else None

    _logger.debug("finished, merging now")
    self.cost_clique = time.time() - self.start

    #self.cache_results(clusters)

  def prune_rule(self, ro):
    if ro.npts < self.min_pts:
        _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
        return False
    
    if (math.isnan(ro.bad_inf) or
        math.isnan(ro.good_inf) or
        math.isnan(ro.inf)):
        _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
        return False
    

    # assuming the best case (the good_stat was zero)
    # would the influence beat the best so far across
    # the full c_range?
    if self.best:
      if ro.dominated_by(max(self.best)):
        _logger.debug("prune? %s\t%s", 'FALSE', str(ro))
        return False

    #if self.best and ro.best_inf <= max(self.best).inf:
    #    # if best tuple influence < rule influence:
    #    if ro.best_tuple_inf <= max(self.best).inf:
    #        _logger.debug("%s\t%s", 'FALSE', str(ro))
    #        return False

    # check max good influence
    if False and ro.good_inf < self.good_thresh:
        # TODO: can skip computing good_stats
        ro.good_skip = True


    #_logger.debug("%s\t%.4f\t%s", 'T', self.best and max(self.best).inf or 0, str(ro))
    return True


  def top_k(self, ro):
    n = 0
    best = self.best and max(self.best, key=lambda ro: ro.inf) or None
    if len(self.best) >= self.max_bests:
      bound = best.inf - self.best[0].inf
      thresh = self.best[0].inf + bound * 0.02
      if ro.inf <= thresh:
        return False

    if ro in self.best:
      return False
    if math.isnan(ro.inf):
      return False

    
    if len(self.best) < self.max_bests:
      n += 1            
      _logger.debug(str(ro))
      heapq.heappush(self.best, ro)
    else:
      n += 1            
      _logger.debug(str(ro))
      heapq.heapreplace(self.best, ro)
    
    best = best and max(best, ro) or ro

    return True

  @instrument
  def load_from_cache(self):
    import bsddb as bsddb3
    self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
    try:
      myhash = str(hash(self))
      if myhash in self.cache and self.use_cache:
        self.update_status("loading partitions from cache")
        dicts, errors = json.loads(self.cache[myhash])
        clusters = map(Cluster.from_dict, dicts)
        for c in clusters:
          self.influence_cluster(c, self.full_table)
        return clusters
    except Exception as e:
      print e
      pdb.set_trace()
      pass
    finally:
      self.cache.close()
    return None


  @instrument
  def cache_results(self, clusters):
    import bsddb as bsddb3
    # save the clusters in a dictionary
    if self.use_cache:
      myhash = str(hash(self))
      self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
      try:
        dicts = [c.to_dict() for c in clusters]
        errors = [c.error for c in clusters]
        self.cache[myhash] = json.dumps((dicts, errors))
      except Exception as e:
        print e
        pdb.set_trace()
        pass
      finally:
        self.cache.close()
Example #36
class System(Observable, Chroner, Wanderer):
    def __init__(self, **kwargs):

        si = self._sortedInputKeys['options']
        options = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedInputKeys['params']
        params = Grouper(OrderedDict([(k, self.inputs[k]) for k in si]))
        si = self._sortedGhostKeys['configs']
        configs = Grouper(OrderedDict([(k, self.ghosts[k]) for k in si]))

        dOptions = options.copy()
        dOptions['hash'] = options.hashID
        dParams = params.copy()
        dParams['hash'] = params.hashID

        # self._systemObserverClasses = self.ghosts['observers']

        super().__init__(options=dOptions,
                         params=dParams,
                         supertype='System',
                         **kwargs)

        self.params, self.options = params, options

    def system_construct(self):
        self.locals = Grouper(
            self._system_construct(self.options, self.params, self.configs))
        self.state.clear()
        for k in self.configs.keys():
            self.state[k] = SystemVar(self.locals[k], k)
        self.observables.clear()
        self.observables.update(self.locals)
        self.baselines.clear()
        self.baselines.update(
            {'mesh': fieldops.get_global_var_data(self.locals.mesh)})
        if hasattr(self.locals, 'obsVars'):
            self._fig = QuickFig(*self.locals.obsVars)
        else:
            self._fig = QuickFig(self.state[0])

    @property
    def constructed(self):
        return hasattr(self, 'locals')

    def _configurable_changed_state_hook(self):
        for k, v in self.configs.items():
            if v is Ellipsis:
                self.state[k].var.data[...] = self.state[k]._initialData

    def _voyager_changed_state_hook(self):
        super()._voyager_changed_state_hook()
        assert self.constructed
        for var in self.state:
            var.update()
        self.locals.solve()

    def _iterate(self):
        dt = self.locals.integrate()
        self.indices.chron.value += dt
        super()._iterate()

    def _out(self):
        outs = super()._out()
        add = self.evaluate()
        outs.update(add)
        return outs

    def _evaluate(self):
        if hasattr(self, 'locals'):
            add = {vn: mut.data for vn, mut in self.state.items()}
        else:
            add = {vn: OutsNull for vn in self.configs.keys()}
        return add

    def _save(self):
        super()._save()
        self.writer.add_dict(self.baselines, 'baselines')

    @_system_construct_if_necessary
    def _load_process(self, outs):
        outs = super()._load_process(outs)
        for key, mut in self.state.items():
            mut.mutate(outs.pop(key))
        return outs

    @property
    def fig(self):
        if self.indices.isnull or not hasattr(self, 'locals'):
            raise Exception("Nothing to show yet.")
        return self._fig

    def show(self):
        self.fig.show()

    def _observation_mode_hook(self):
        if self.indices.isnull:
            self.initialise()
        super()._observation_mode_hook()
Example #37
 def test_lookups(self):
     words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
     groups = Grouper(words, key=str.lower)
     self.assertEqual(groups['apple'], ["Apple", "apple"])
Example #38
 def test_containment(self):
     words = ["Apple", "animal", "apple", "ANIMAL", "animal"]
     groups = Grouper(words, key=str.lower)
     self.assertIn('apple', groups)
Example #39
        down_wall = Wall()
        current_cell.down_wall = down_wall
        right_wall = Wall()
        current_cell.right_wall = right_wall
        if y != Y-1:
            down_wall.neighbours = (current_cell, cells[(x,y+1)])
            walls.append(down_wall)

        if x != X-1:
            right_wall.neighbours = (current_cell, cells[(x+1,y)])
            walls.append(right_wall)


cell_list = [cells[key] for key in cells]

maze = Grouper(cell_list)

for _ in range(len(walls)):
    
    wall = popchoice(walls)
    cell_1, cell_2 = wall.neighbours
    
    if not maze.joined(cell_1, cell_2):
        wall.active = False
        maze.join(cell_1, cell_2)



maze_map = []

x_max = (X*2)+1
Example #40
 def test_no_iterable_given(self):
     groups = Grouper(key=str.lower)
     self.assertEqual(dict(groups), {})
Example #41
    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)
        self.grouper = Grouper(full_table, self)

        self.SCORE_ID = add_meta_column(
            chain([full_table], bad_tables, good_tables), 'SCOREVAR')
Example #42
class MR(Basic):


    def __init__(self, *args, **kwargs):
        Basic.__init__(self, *args, **kwargs)
        self.best = []
        self.max_wait = kwargs.get('max_wait', 2 * 60 * 60) # 2 hours
        self.start = None
        self.stop = False
        self.n_rules_checked = 0
        self.naive = kwargs.get('naive', False)
        self.max_bests = 50
        self.max_complexity = kwargs.get('max_complexity', 3)

        self.checkpoints = []

        self.cost_clique = 0

    def __hash__(self):
        components = [
          self.__class__.__name__,
          str(self.aggerr.__class__.__name__),
          str(set(self.cols)),
          self.epsilon,
          self.tau,
          self.p,
          self.err_func.__class__.__name__,
          self.tablename,
          self.aggerr.keys,
          self.max_wait,
          self.c_range
        ]
        components = map(str, components)
        return hash('\n'.join(components))
 

    def setup_tables(self, full_table, bad_tables, good_tables, **kwargs):
        Basic.setup_tables(self, full_table, bad_tables, good_tables, **kwargs)
        self.grouper = Grouper(full_table, self) 

        self.SCORE_ID = add_meta_column(
                chain([full_table], bad_tables, good_tables),
                'SCOREVAR' 
        )




    def set_params(self, **kwargs):
        self.cols = kwargs.get('cols', self.cols)
        self.params.update(kwargs)
        self.max_bad_inf = -1e1000000
        self.good_thresh = 0.0001
        self.granularity = kwargs.get('granularity', self.granularity)

    def make_rules(self, cur_groups):
      if cur_groups == None:
        new_groups = self.grouper.initial_groups()
      else:
        new_groups = self.grouper.merge_groups(cur_groups)

      rules = {}

      for attrs, groups in new_groups:
        start = time.time()
        for ro in self.grouper(attrs, groups):

          if self.max_wait:
            self.n_rules_checked -= len(ro.rule.filter.conditions)
            if self.n_rules_checked <= 0:
              diff = time.time() - self.start
              if not self.checkpoints or diff - self.checkpoints[-1][0] > 10:
                if self.best:
                  best_rule = max(self.best, key=lambda r: r.inf).rule
                  self.checkpoints.append((diff, best_rule))
              self.stop = diff > self.max_wait
              self.n_rules_checked = 1000
            if self.stop:
              _logger.debug("wait %d > %d exceeded." % (diff, self.max_wait))
              return


          yield attrs, ro
#        print "group by\t%s\t%.4f" % (str([attr.name for attr in attrs]), time.time()-start)



    def __call__(self, full_table, bad_tables, good_tables, **kwargs):
      self.setup_tables(full_table, bad_tables, good_tables, **kwargs)

      clusters = self.find_cliques()
      clusters = self.merge_rules(clusters)
      clusters = filter(lambda c: r_vol(c.c_range), clusters)
      clusters.sort(reverse=True)
      self.all_clusters = self.final_clusters = clusters
      return clusters

      self.best.sort(reverse=True)
      return self.merge_rules(self.best)



    def find_cliques(self):
      """
      table has been trimmed of extraneous columns.
      """
      clusters = self.load_from_cache()
      if clusters is not None:
        return clusters

      rules = None
      self.opts_per_iter = []
      self.best = []
      self.start = time.time()

      nseen = 0
      niters = 0 
      while niters < self.max_complexity and not self.stop and (rules is None or rules):
          niters += 1
          _logger.debug("=========iter %d=========", niters)
          besthash = hash(tuple(self.best))

          nadded = 0
          nnewgroups = 0
          new_rules = defaultdict(list)
          
          # for each combination of attributes
          # prune the groups that are less influential than the parent group's 
          #  

          for attr, ro in self.make_rules(rules):
              nseen += 1
              if self.stop:
                  break
              nadded += self.top_k((ro,))
              if self.naive:
                  new_rules[attr] = [None]
                  nnewgroups += 1
              elif self.prune_rule(ro):
                  new_rules[attr].append(ro.group)
                  nnewgroups += 1
              ro.rule.__examples__ = None
              if nnewgroups % 10000 == 0:
                  pass
                  #print "# new groups\t", nnewgroups, '\t', time.time()-self.start, self.max_wait



          if not nadded: 
              pass
#                break

          rules = new_rules
          if niters == 1:
              self.opts_per_iter.append(list(self.best))
          else:
              self.opts_per_iter.append(list(self.best[1:]))
              if prev_best and prev_best in self.opts_per_iter[-1]:
                  self.opts_per_iter[-1].remove(prev_best)
          self.best = [max(self.best)] if self.best else []
          prev_best = max(self.best) if self.best else None


      _logger.debug("finished, merging now")
      self.cost_clique = time.time() - self.start


      ret = []
      for bests in self.opts_per_iter:
          bests.sort(reverse=True)
          ret.extend(bests)# self.merge_rules(bests))
      clusters = map(self.blah_to_cluster, ret)

      self.cache_results(clusters)
      return clusters

    def blah_to_cluster(self, blah):
      rule = blah.rule
      fill_in_rules([rule], self.full_table, self.cols)
      c = Cluster.from_rule(rule, self.cols)
      c.error = self.influence_cluster(c, self.full_table)
      return c


    def merge_rules(self, clusters):
      start = time.time()

      clusters = filter_bad_clusters(clusters)
      thresh = compute_clusters_threshold(clusters, nstds=0.)
      is_mergable = lambda c: c.error >= thresh
      is_mergable = lambda c: True
      influence_f = lambda c: self.influence_cluster(c, self.full_table)
      params = dict(self.params)
      params.update({
        'learner_hash': hash(self),
        'cols' : self.cols,
        'influence' : influence_f,
        'is_mergable' : is_mergable,
        'c_range': self.c_range,
        'use_mtuples' : False,
        'learner' : self,
        'partitions_complete' : False
      })
      self.merger = RangeMerger(**params)
      #self.merger = Merger(**params)
      self.final_clusters = self.merger(clusters)
      self.all_clusters = clusters
      self.cost_merge = time.time() - start

      self.costs = {
              'cost_clique' : self.cost_clique,
              'cost_merge' : self.cost_merge
      }

      return self.final_clusters


    def prune_rules(self, rules):
      ret = defaultdict(set)
      for key, ros in rules.iteritems():
          for ro in ros:
              if self.prune_rule(ro):
                  ret[key].add(ro)
      return ret
    
    def prune_rule(self, ro):
      # update bad influence bounds
      self.max_bad_inf = max(self.max_bad_inf, ro.bad_inf)
      self.bad_thresh = max(self.bad_thresh, 0.01 * self.max_bad_inf)

      if ro.npts < self.min_pts:
          _logger.debug("%s\t%s", 'FALSE', str(ro))
          return False
      
      if (math.isnan(ro.bad_inf) or
          math.isnan(ro.good_inf) or
          math.isnan(ro.inf)):
          _logger.debug("%s\t%s", 'FALSE', str(ro))
          return False
      
      # check min bad influence
      #if ro.bad_inf < self.bad_thresh:
      #    return False


      # assuming the best case (the good_stat was zero)
      # would the influence beat the best so far across
      # the full c_range?
      if self.best:
        if ro.dominated_by(max(self.best)):
          return False

      #if self.best and ro.best_inf <= max(self.best).inf:
      #    # if best tuple influence < rule influence:
      #    if ro.best_tuple_inf <= max(self.best).inf:
      #        _logger.debug("%s\t%s", 'FALSE', str(ro))
      #        return False

      # check max good influence
      if ro.good_inf < self.good_thresh:
          # TODO: can skip computing good_stats
          ro.good_skip = True


      #_logger.debug("%s\t%.4f\t%s", 'T', self.best and max(self.best).inf or 0, str(ro))
      return True


    def top_k(self, rules):
      n = 0
      best = self.best and max(self.best, key=lambda ro: ro.inf) or None
      for ro in rules:
          if len(self.best) >= self.max_bests:
              bound = best.inf - self.best[0].inf
              thresh = self.best[0].inf + bound * 0.02
              if ro.inf <= thresh:
                  continue
          if ro in self.best:
              continue
          if math.isnan(ro.inf):
              continue

          if not best or ro.inf > best.inf:
              n += 1            
              _logger.debug(str(ro))

          if len(self.best) < self.max_bests:
              heapq.heappush(self.best, ro)
          else:
              heapq.heapreplace(self.best, ro)
          
          best = best and max(best, ro) or ro

      return n

    @instrument
    def load_from_cache(self):
      import bsddb as bsddb3
      self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
      try:
        myhash = str(hash(self))
        if myhash in self.cache and self.use_cache:
          dicts, errors = json.loads(self.cache[myhash])
          clusters = map(Cluster.from_dict, dicts)
          return clusters
      except Exception as e:
        print e
        pdb.set_trace()
        pass
      finally:
        self.cache.close()
      return None


    @instrument
    def cache_results(self, clusters):
      import bsddb as bsddb3
      # save the clusters in a dictionary
      if self.use_cache:
        myhash = str(hash(self))
        self.cache = bsddb3.hashopen('./dbwipes.mr.cache')
        try:
          dicts = [c.to_dict() for c in clusters]
          errors = [c.error for c in clusters]
          self.cache[myhash] = json.dumps((dicts, errors))
        except Exception as e:
          print e
          pdb.set_trace()
          pass
        finally:
          self.cache.close()