Example #1
0
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, invert_score=False):
    '''
    Annotate graphs with the vectorizer/estimator, doing the work only once.

    Graph annotation is slow; a marker flag is stored on the first graph so a
    second call (e.g. in predict after fit) returns the input unchanged.

    Parameters
    ----------
    inputs: list of graphs; None entries are silently dropped
    vectorizer: object exposing annotate(graphs, estimator=...)
    score_attribute: node attribute name used by annotation
        (only forwarded to the worker processes here)
    estimator: passed through to vectorizer.annotate
    multi_process: if True, annotate chunks of 50 graphs in a Pool
    invert_score: accepted for interface compatibility; the inversion step
        is currently disabled (see commented-out code below)

    Returns
    -------
    list of annotated graphs
    '''
    #  1st check if already annotated
    if inputs[0].graph.get('mass_annotate_mp_was_here', False):
        return inputs

    if not multi_process:
        # explicit list instead of filter(): filter is a lazy iterator on
        # Python 3, and we need a list below anyway
        inputs = [g for g in inputs if g is not None]
        res = list(vectorizer.annotate(inputs, estimator=estimator))
        #if invert_score:
        #    def f(n,d): d['importance'] = -d['importance']
        #    res=utils.map_node_operation(res,f)

        # guard: every input may have been None
        if res:
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        # forward invert_score so workers behave like the single-process path
        mpres = [eden.apply_async(pool, mass_annotate_mp,
                                  args=(graphs, vectorizer, score_attribute, estimator, False, invert_score))
                 for graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
        pool.close()
        pool.join()
        return result
Example #2
0
    def _make_multi_process_batches(self, problem_iter):
        '''
        Prepare (pickled self, pickled batch) work items for multiprocessing.

        We do two things here:
        - break tasks into batches to be multiprocessed.
        - multiprocess sometimes does not terminate properly, so we count how
          many tasks go in (self.multiprocess_jobcount) and terminate once
          that number of outputs is reached.

        Parameters
        ----------
        problem_iter: problems to put into the multiprocess queue

        Returns
        -------
            yields batch_size-sized problem chunks as (pickled self, pickled batch)
        '''
        try:
            s = dill.dumps(self, byref=False)
        except Exception as exc:
            # dill dies silently sometimes -- make the failure loud, then
            # re-raise instead of continuing with `s` unbound (which would
            # surface later as a confusing NameError at yield time)
            print(exc)
            print("dill dump failed in graphlearn.py (dill dies silently sometimes)")
            raise
        self.multiprocess_jobcount = 0
        self.multiprocess_all_prepared = False

        for e in grouper(problem_iter, self.batch_size):
            # cant just take batch size here because output of nons will be
            # suppressed -- grouper pads the last chunk, so count real problems
            self.multiprocess_jobcount += sum(1 for problem in e if problem is not None)
            batch = dill.dumps(e)
            yield (s, batch)
        self.multiprocess_all_prepared = True
    def _multi_process_argbuilder(self, graphs, batch_size=10):
        '''
        Yield dill-pickled (cip-extractor, args, graph-batch) work items.

        Also tracks how many jobs were handed out (self.multiprocess_jobcount)
        and flags self.mp_prepared once all batches are emitted, so the
        consumer knows when to stop waiting for results.

        Parameters
        ----------
        graphs: iterable of input graphs
        batch_size: number of graphs per pickled work item

        Returns
        -------
            yields dill.dumps((function, args, batch)) strings
        '''
        args = self._get_args()
        function = self.get_cip_extractor()
        self.multiprocess_jobcount = 0
        self.mp_prepared = False
        for batch in grouper(graphs, batch_size):
            # count actual problems instead of += batch_size: the final chunk
            # from grouper is padded (see the non-None counting and its
            # comment in _make_multi_process_batches), so adding batch_size
            # over-counts jobs and could stall the termination check.
            self.multiprocess_jobcount += sum(1 for g in batch if g is not None)
            yield dill.dumps((function, args, batch))
        self.mp_prepared = True
Example #4
0
def mass_annotate_mp(inputs, vectorizer, score_attribute='importance', estimator=None, multi_process=False, annotate_dilude_scores=False):
    '''
    Annotate graphs with the vectorizer/estimator, optionally diluting scores.

    Graph annotation is slow; a marker flag is stored on the first graph so a
    caller can detect prior annotation (the early-return check is currently
    disabled, see below).

    Parameters
    ----------
    inputs: list of graphs; None entries are silently dropped
    vectorizer: object exposing annotate(graphs, estimator=...)
    score_attribute: node attribute holding [score, ...] lists
    estimator: passed through to vectorizer.annotate
    multi_process: if True, annotate chunks of 50 graphs in a Pool
    annotate_dilude_scores: if True, smooth each node's score with its
        neighbors' scores after annotation

    Returns
    -------
    list of annotated graphs
    '''
    #  1st check if already annotated  (short-circuit intentionally disabled)
    #if inputs[0].graph.get('mass_annotate_mp_was_here', False):
    #    return inputs

    if not multi_process:
        # explicit list instead of filter(): filter is lazy on Python 3
        inputs = [g for g in inputs if g is not None]

        res = list(vectorizer.annotate(inputs, estimator=estimator))

        def dilute_graph(graph):
            # Smooth each node's score toward its neighborhood average.
            # NOTE(review): graph.node[...] is the pre-2.0 networkx API --
            # confirm the networkx version in use.
            for n, d in graph.nodes(data=True):
                neigh_scores = [graph.node[other][score_attribute][0] for other in graph.neighbors(n)]
                if neigh_scores:
                    # weight own score as heavily as all neighbors combined
                    allfacs = neigh_scores + [graph.node[n][score_attribute][0]] * len(neigh_scores)
                    score = sum(allfacs) / float(len(allfacs))
                else:
                    score = d[score_attribute][0]
                d['tmpscore'] = score

            # second pass: commit the smoothed scores
            for n, d in graph.nodes(data=True):
                d[score_attribute] = [d['tmpscore'], 0]
                # self.attribute =  lambda x: x['tmpscore']

        if annotate_dilude_scores:
            # explicit loop instead of map(): map is a lazy iterator on
            # Python 3 and would silently skip the dilution side effect
            for g in res:
                dilute_graph(g)

        #if invert_score:
        #    def f(n,d): d['importance'] = -d['importance']
        #    res=utils.map_node_operation(res,f)

        # guard: every input may have been None
        if res:
            res[0].graph['mass_annotate_mp_was_here'] = True
        return res
    else:
        pool = mp.Pool()
        # forward annotate_dilude_scores so workers behave like the
        # single-process path (it was silently dropped before)
        mpres = [eden.apply_async(pool, mass_annotate_mp,
                                  args=(graphs, vectorizer, score_attribute, estimator, False, annotate_dilude_scores))
                 for graphs in eden.grouper(inputs, 50)]
        result = []
        for res in mpres:
            result += res.get()
        pool.close()
        pool.join()
        return result
 def _multi_process_argbuilder(self, graphs, batch_size=10):
     '''Yield dill-serialized (cip-extractor, args, graph-chunk) work items.'''
     extractor_args = self._get_args()
     extractor = self.get_cip_extractor()
     for chunk in grouper(graphs, batch_size):
         payload = (extractor, extractor_args, chunk)
         yield dill.dumps(payload)
Example #6
0
 def _argbuilder(self, problem_iter):
     '''Serialize self once, then yield (pickled self, pickled batch) pairs
     so the task can be shipped to worker processes in small bites.'''
     pickled_self = dill.dumps(self)
     for group in grouper(problem_iter, self.batch_size):
         pickled_group = dill.dumps(group)
         yield (pickled_self, pickled_group)
 def _multi_process_argbuilder(self, graphs, batch_size=10):
     '''Chunk the graphs and yield each chunk dill-pickled together with the
     cip extractor and its arguments, ready for a worker process.'''
     extractor = self.get_cip_extractor()
     extractor_args = self._get_args()
     for graph_chunk in grouper(graphs, batch_size):
         yield dill.dumps((extractor, extractor_args, graph_chunk))