def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, invert_score=False):
    '''Annotate graphs with importance scores via vectorizer.annotate.

    Graph annotation is slow and we don't want to do it twice (fit and
    predict), so the first graph of an annotated batch is flagged and an
    already-annotated list is returned unchanged.

    Parameters
    ----------
    inputs: list of graphs; may contain None entries, which are dropped
    vectorizer: object providing annotate(graphs, estimator=...)
    score_attribute: name of the node score attribute (passed to workers)
    estimator: forwarded to vectorizer.annotate
    multi_process: when True, annotate in parallel chunks of 50 graphs
    invert_score: currently unused; kept for interface compatibility
        (the score-negation code was disabled upstream)

    Returns
    -------
    list of annotated graphs
    '''
    # 1st check if already annotated (guard: first entry may be None or the
    # list may be empty, which previously raised before the None filter ran)
    if inputs and inputs[0] is not None and inputs[0].graph.get('mass_annotate_mp_was_here', False):
        return inputs
    if not multi_process:
        inputs = [g for g in inputs if g is not None]
        res = list(vectorizer.annotate(inputs, estimator=estimator))
        # NOTE(review): invert_score used to flip d['importance'] per node;
        # that code was commented out upstream and is intentionally left out.
        if res:  # empty input would otherwise raise IndexError on res[0]
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        try:
            # each worker annotates a chunk of 50 graphs sequentially
            mpres = [eden.apply_async(pool, mass_annotate_mp,
                                      args=(graphs, vectorizer, score_attribute, estimator))
                     for graphs in eden.grouper(inputs, 50)]
            result = []
            for chunk in mpres:
                result += chunk.get()
        finally:
            # close the pool even if a worker raised
            pool.close()
            pool.join()
        return result
def _make_multi_process_batches(self, problem_iter):
    '''Split problems into pickled batches for the multiprocess queue.

    We do two things here:
    - break tasks into batches to be multiprocessed.
    - multiprocess sometimes does not terminate properly, so we count how
      many tasks go in (self.multiprocess_jobcount) and the consumer can
      terminate once that number of outputs is reached.

    Parameters
    ----------
    problem_iter: iterable
        problems to put into the multiprocess queue

    Yields
    ------
    (pickled_self, pickled_batch) tuples, one per batch_size-sized chunk
    '''
    try:
        s = dill.dumps(self, byref=False)
    except Exception as exc:
        # dill dies silently sometimes; log and re-raise instead of falling
        # through with `s` unbound (which produced a NameError at yield time)
        print(exc)
        print("dill dump failed in graphlearn.py (dill dies silently sometimes)")
        raise
    self.multiprocess_jobcount = 0
    self.multiprocess_all_prepared = False
    for chunk in grouper(problem_iter, self.batch_size):
        # cant just take batch size here because output of Nones will be
        # suppressed downstream; count only real problems
        self.multiprocess_jobcount += sum(1 for problem in chunk if problem is not None)
        yield (s, dill.dumps(chunk))
    self.multiprocess_all_prepared = True
def _multi_process_argbuilder(self, graphs, batch_size=10):
    '''Pickle (cip-extractor, args, batch) triples for worker processes.

    Tracks how many jobs were queued in self.multiprocess_jobcount so the
    consumer can tell when all results have come back; self.mp_prepared is
    set once every batch has been yielded.

    Parameters
    ----------
    graphs: iterable of input graphs
    batch_size: number of graphs per pickled batch

    Yields
    ------
    dill-pickled (function, args, batch) tuples
    '''
    args = self._get_args()
    function = self.get_cip_extractor()
    self.multiprocess_jobcount = 0
    self.mp_prepared = False
    for batch in grouper(graphs, batch_size):
        # grouper pads the final chunk, so adding batch_size overcounts;
        # count only real (non-None) problems so the job counter matches
        # the number of results that will actually come back
        self.multiprocess_jobcount += sum(1 for item in batch if item is not None)
        yield dill.dumps((function, args, batch))
    self.mp_prepared = True
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, annotate_dilude_scores=False):
    '''Annotate graphs with importance scores, optionally diluting them.

    Graph annotation is slow and we don't want to do it twice (fit and
    predict); the first graph of an annotated batch is flagged.

    Parameters
    ----------
    inputs: list of graphs; None entries are dropped
    vectorizer: object providing annotate(graphs, estimator=...)
    score_attribute: node attribute holding [score, ...] after annotation
    estimator: forwarded to vectorizer.annotate
    multi_process: when True, annotate in parallel chunks of 50 graphs
    annotate_dilude_scores: smooth each node score with its neighbors' scores

    Returns
    -------
    list of annotated graphs
    '''
    if not multi_process:
        inputs = [g for g in inputs if g is not None]
        res = list(vectorizer.annotate(inputs, estimator=estimator))

        def dilute_graph(graph):
            # average each node's score with its neighborhood: every neighbor
            # contributes once, the node itself once per neighbor
            for n, d in graph.nodes(data=True):
                neighscores = [graph.node[other][score_attribute][0]
                               for other in graph.neighbors(n)]
                if neighscores:
                    allfacs = neighscores + [graph.node[n][score_attribute][0]] * len(neighscores)
                    score = sum(allfacs) / float(len(allfacs))
                else:
                    score = d[score_attribute][0]
                # stash in tmpscore so the second pass sees original scores
                d['tmpscore'] = score
            for n, d in graph.nodes(data=True):
                d[score_attribute] = [d['tmpscore'], 0]

        if annotate_dilude_scores:
            # explicit loop instead of map(): map is lazy on Python 3 and
            # would silently skip the dilution entirely
            for graph in res:
                dilute_graph(graph)
        if res:  # empty input would otherwise raise IndexError on res[0]
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        try:
            mpres = [eden.apply_async(pool, mass_annotate_mp,
                                      args=(graphs, vectorizer, score_attribute, estimator))
                     for graphs in eden.grouper(inputs, 50)]
            result = []
            for part in mpres:
                result += part.get()
        finally:
            # close the pool even if a worker raised
            pool.close()
            pool.join()
        return result
def _multi_process_argbuilder(self, graphs, batch_size=10):
    '''Yield dill-pickled (cip-extractor, args, batch) triples, one per
    batch_size-sized chunk of graphs, ready for worker processes.'''
    shared_args = self._get_args()
    extract = self.get_cip_extractor()
    for chunk in grouper(graphs, batch_size):
        yield dill.dumps((extract, shared_args, chunk))
def _argbuilder(self, problem_iter):
    '''Cut the task stream into small multiprocessable bites: yields
    (pickled_self, pickled_batch) pairs, one per batch_size-sized chunk.'''
    # self is pickled once and shipped with every batch
    pickled_self = dill.dumps(self)
    for chunk in grouper(problem_iter, self.batch_size):
        pickled_chunk = dill.dumps(chunk)
        yield (pickled_self, pickled_chunk)