Ejemplo n.º 1
0
 def get_guesses (sc, input_dir, partitions, articles, slices=1, slice_n=1):
     """Build the guesses and the article ids for one slice of `articles`.

     `articles` is cut into `slices` consecutive chunks of
     len(articles) // slices items each, and chunk number `slice_n` is
     processed. `slice_n` is zero-based: the chunk starts at
     slice_size * slice_n, so the meaningful values are 0 .. slices - 1.
     The final chunk (slice_n == slices - 1) additionally absorbs whatever
     remainder the integer division leaves, so that the chunks together cover
     every article exactly once.

     NOTE(review): with the defaults (slices=1, slice_n=1) the chunk starts at
     len(articles) and is therefore empty; processing the whole list requires
     slice_n=0. Confirm against callers whether the default is intentional.

     Args:
         sc: object whose parallelize() distributes the chunk; presumably a
             SparkContext -- confirm against callers.
         input_dir: only used in the log message.
         partitions: passed through as the second argument of
             sc.parallelize().
         articles: sequence supporting len() and slicing; each element is
             handed to EquivalentSet.get_article.
         slices: number of chunks `articles` is divided into.
         slice_n: zero-based index of the chunk to process.

     Returns:
         A 2-tuple (guesses, ids):
         guesses -- cached result of flat-mapping Guesses.get_article_guesses
                    over the loaded, sampled articles;
         ids     -- collected list holding the `id` attribute of each of
                    those articles.
     """
     from chemotext_util import LoggingUtil
     logger = LoggingUtil.init_logging (__file__)

     slice_size = len (articles) // slices
     offset = slice_size * slice_n
     # Only the last chunk grows to take the remainder. Deciding this by chunk
     # index (rather than by how much is left) keeps neighbouring chunks from
     # overlapping when len(articles) divides evenly, and keeps the tail from
     # being dropped when the remainder is larger than one chunk.
     if slice_n == slices - 1:
         slice_size = len (articles) - offset
     the_slice = articles [ offset : offset + slice_size ]

     logger.info ("   -- Guesses (input:{0}, articles:{1}, slice_size:{2}, offset:{3})".
                  format (input_dir, len(articles), slice_size, offset))

     # debug_scale comes from outside this function; it is passed as the
     # second argument of sample() (presumably the sampling fraction).
     articles = sc.parallelize (the_slice, partitions).  \
                flatMap (lambda p : EquivalentSet.get_article (p)).\
                sample (False, debug_scale).\
                cache ()
     return (
         articles.flatMap (Guesses.get_article_guesses).cache (),
         articles.map (lambda a : a.id).collect ()
     )