import os
import pickle
import random
import time

# Estimator, Transformer and prolog_grammar are project-local modules and are
# assumed to be imported elsewhere in this file.


class Actor:
    def __init__(self, args):
        self.estimator = Estimator(emb_dim=args.emb_dim,
                                   n_hidden=args.n_hidden,
                                   bidirectional=args.bi,
                                   n_layer=args.n_layer,
                                   dropout=args.dropout,
                                   lr=args.lr,
                                   decay=args.decay,
                                   lr_p=args.lr_p,
                                   clip=args.clip,
                                   batch_size=args.batch,
                                   epoch_num=args.epoch_num,
                                   cuda=args.cuda,
                                   path=args.path)
        self.transformer = Transformer(prolog_grammar.GRAMMAR_DICTIONARY,
                                       prolog_grammar.ROOT_RULE)
        self.performances = []
        self.actions = []
        self.path = args.path

    def search(self):
        """Random search: apply 25 random grammar transformations, evaluating after each one."""
        self.perform('initial')
        for i in range(25):
            print(i)
            try:
                self.step()
                self.perform(i)
            except BaseException as e:
                # On any failure, dump the action history and the current grammar
                # so the run can be inspected later.
                print(e)
                print(self.actions)
                print(self.performances)
                with open('gra.pkl', 'wb') as f:
                    pickle.dump(self.transformer.get_grammar_dict(), f)
                exit(-1)
        print(self.performances)

    def step(self):
        """Pick a random non-empty action type and apply one random action of that type."""
        t1 = time.time()
        action_space = self.transformer.get_act_space()
        t2 = time.time()  # t1/t2 bracket the action-space construction (timing left from debugging)
        method = []
        i = -1
        while len(method) == 0:
            i = random.randint(0, 3)
            method = action_space[i]
        action = random.choice(method)
        print(i, action)
        if i == 0:
            self.transformer.creat_nt(action)
        elif i == 1:
            self.transformer.merge_nt(action)
        elif i == 2:
            self.transformer.combine_nt(*action)
        else:
            assert i == 3
            self.transformer.delete_prod(action)
        self.actions.append((i, action))

    def perform(self, name):
        """Checkpoint the transformer to disk and evaluate the current grammar."""
        grammar_dict, root_rule = self.transformer.get_grammar_dict()
        with open(os.path.join(self.path, f'grammar-{name}'), 'wb') as f:
            pickle.dump(self.transformer, f)
        perform = self.estimator.estimate(grammar_dict, root_rule, toy=False, name=repr(name))
        self.performances.append(perform)
        print(perform)
        return perform

    def exp(self, name):
        for _ in range(100):
            self.step()
        self.perform(name)

    def one(self):
        for i in range(50):
            self.step()
        grammar_dict, root_rule = self.transformer.get_grammar_dict()
        for i in range(10000):
            perform = self.estimator.estimate(grammar_dict, root_rule, toy=False, name='tmp')
            print(perform)
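# Usage sketch (added; not part of the original script). It shows how Actor
# could be driven from an argparse namespace: the argument names match those
# read in __init__ above, but the default values are placeholders, not the
# project's actual settings.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--emb_dim', type=int, default=128)    # placeholder default
    parser.add_argument('--n_hidden', type=int, default=256)   # placeholder default
    parser.add_argument('--bi', action='store_true')
    parser.add_argument('--n_layer', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--decay', type=float, default=0.0)
    parser.add_argument('--lr_p', type=float, default=1e-3)
    parser.add_argument('--clip', type=float, default=5.0)
    parser.add_argument('--batch', type=int, default=32)
    parser.add_argument('--epoch_num', type=int, default=10)
    parser.add_argument('--cuda', action='store_true')
    parser.add_argument('--path', type=str, default='.')
    args = parser.parse_args()

    actor = Actor(args)
    actor.search()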
import json
import time
# log (a logging.Logger), Estimator, frequents, _calculate_sample_size_2,
# _countElements and _expand are assumed to be defined at module level.


def alg(sc, data_set_rdd, data_set_size, threshold, epsilon, randomized=True, alpha=0.1):
    data_set_rdd.cache()
    partitions_num = data_set_rdd.getNumPartitions()
    sample_size = _calculate_sample_size_2(threshold, data_set_size, epsilon, alpha)

    # Draw three independent samples; an element is kept as a "common" candidate
    # only if it passes the scaled frequency threshold in at least two of them.
    collected_sample = data_set_rdd.sample(False, float(sample_size) / data_set_size).collect()
    collected_sample2 = data_set_rdd.sample(False, float(sample_size) / data_set_size).collect()
    collected_sample3 = data_set_rdd.sample(False, float(sample_size) / data_set_size).collect()

    log.info('Using sample of size %d', sample_size)
    print 'Using sample of size %d' % sample_size
    print 'ratio - %f' % (float(sample_size) / data_set_size)

    scaled_threshold = float(threshold) * sample_size / data_set_size if randomized else threshold

    frequencies1 = _countElements(collected_sample, float(threshold) * sample_size / data_set_size)
    common_elements1 = set(frequencies1.keys())
    frequencies2 = _countElements(collected_sample2, float(threshold) * sample_size / data_set_size)
    common_elements2 = set(frequencies2.keys())
    del collected_sample2
    frequencies3 = _countElements(collected_sample3, float(threshold) * sample_size / data_set_size)
    common_elements3 = set(frequencies3.keys())
    del collected_sample3

    # Majority vote: keep elements that are frequent in at least two of the three samples.
    common_candidates = common_elements1.union(common_elements2).union(common_elements3)
    common_elements_set = set()
    for candidate in common_candidates:
        i = 0
        if candidate in common_elements1:
            i += 1
        if candidate in common_elements2:
            i += 1
        if candidate in common_elements3:
            i += 1
        if i >= 2:
            common_elements_set.add(candidate)
    common_elements = list(common_elements_set)

    # In randomized mode, all further estimation is done on the first sample only.
    data_estimator = Estimator(sc.parallelize(collected_sample)) if randomized \
        else Estimator(data_set_rdd)
    singletons = data_estimator.getSingletons()
    cis_tree = frequents.Frequents()
    candidates = [set([i]) for i in common_elements]
    iteration = 1
    scaling_factor = data_set_size / sample_size if randomized else 1.0

    while candidates:
        log.info('Iteration %d starts. candidates set size is %d', iteration, len(candidates))
        log.info('Starting Estimating and filtering. There are %d candidates', len(candidates))
        start = time.time()
        # Keep candidates whose estimated frequency passes the (scaled) threshold and
        # rescale their estimates back to the full data set size.
        next_level = data_estimator.estimate(candidates) \
            .filter(lambda pair: pair[1][1] >= scaled_threshold) \
            .map(lambda x: (x[1][0], int(min(x[1][1] * scaling_factor, data_set_size))))
        next_level.cache()
        cis_next_level = next_level.collect()
        cis_next_level = filter(lambda x: x[0].issubset(common_elements_set), cis_next_level)
        end = time.time()
        log.info('Estimation and filter done in %d seconds. Filtering candidates', end - start)
        if not cis_next_level:
            log.info('No candidates remained. Quitting iteration %d', iteration)
            break
        log.info('Adding new computed level to the resulting lattice, of size %d', len(cis_next_level))
        log.info('New level is - %s', cis_next_level)
        start = time.time()
        cis_tree.add_level(cis_next_level)
        end = time.time()
        log.info('Next level addition to lattice completed in %d seconds', end - start)
        start = time.time()
        candidates = _expand(next_level, common_elements, partitions_num)
        end = time.time()
        log.info('Fast expansion took %d seconds and created %d candidates, Iteration %d completed',
                 end - start, len(candidates), iteration)
        log.info('New candidates are %s', candidates)
        iteration += 1

    if not randomized:
        cis_tree.result = [(itemset.items, itemset.frequency) for itemset in cis_tree.get_all()]
        cis_tree.result = {str(sorted(list(i[0]))): i[1] for i in cis_tree.result}
        return cis_tree

    # In randomized mode, recompute the frequencies of the surviving itemsets
    # over the full data set before returning.
    estimator = Estimator(data_set_rdd)
    final_itemsets = [itemset.items for itemset in cis_tree.get_all()]
    cis_tree.result = estimator.compute(final_itemsets).collect()
    cis_tree.result = {str(sorted(list(json.loads(i[0])))): i[1] for i in cis_tree.result}
    return cis_tree
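# Usage sketch (added; not part of the original module). It illustrates one way
# alg could be invoked. The transaction format (a set of item ids per record)
# and all numeric values below are assumptions, not values from the project;
# the data set must be large enough that the computed sample fraction stays below 1.
if __name__ == '__main__':
    from pyspark import SparkContext

    sc = SparkContext(appName='frequent-itemsets-sketch')
    transactions = [set([1, 2, 3]), set([1, 2]), set([2, 3]), set([1, 3])] * 1000  # toy data
    rdd = sc.parallelize(transactions)
    lattice = alg(sc, rdd, data_set_size=len(transactions), threshold=1500,
                  epsilon=0.1, randomized=True, alpha=0.1)
    print(lattice.result)
    sc.stop()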