def get_guesses(sc, input_dir, partitions, articles, slices=1, slice_n=1):
    """Compute guesses for one slice of ``articles``.

    ``articles`` is cut into ``slices`` chunks of ``len(articles) // slices``
    items each; the chunk starting at ``slice_size * slice_n`` is parallelized,
    expanded through ``EquivalentSet.get_article``, sampled by the module-level
    ``debug_scale`` and cached.

    Args:
        sc: Object providing ``parallelize`` (presumably a SparkContext --
            confirm against callers).
        input_dir: Only used in the log message emitted here.
        partitions: Passed as the second argument to ``sc.parallelize``.
        articles: Sliceable sequence of inputs for
            ``EquivalentSet.get_article``.
        slices: Number of chunks ``articles`` is divided into.
        slice_n: Index of the chunk to process.

    Returns:
        A 2-tuple ``(guesses, ids)``: the cached result of flat-mapping
        ``Guesses.get_article_guesses`` over the parsed articles, and the
        collected list of each parsed article's ``id``.

    Raises:
        ZeroDivisionError: If ``slices`` is 0.
    """
    from chemotext_util import LoggingUtil

    logger = LoggingUtil.init_logging(__file__)

    total = len(articles)
    slice_size = total // slices
    offset = slice_size * slice_n
    rest = total - offset

    # NOTE(review): offset is slice_size * slice_n, so slice_n looks zero-based;
    # with the defaults (slices=1, slice_n=1) offset == len(articles) and the
    # slice is empty. Confirm against callers.

    # Let the final chunk absorb the remainder left by the integer division.
    # The upper bound is strict: when the division is exact, the second-to-last
    # chunk has rest == 2 * slice_size and must NOT swallow the last chunk too,
    # otherwise those articles are processed by both slices.
    if slice_size < rest < 2 * slice_size:
        slice_size = rest

    the_slice = articles[offset:offset + slice_size]
    logger.info(
        " -- Guesses (input:{0}, articles:{1}, slice_size:{2}, offset:{3})".format(
            input_dir, total, slice_size, offset
        )
    )

    # Cached because the result feeds both elements of the returned tuple.
    # Assuming a PySpark RDD, sample(False, debug_scale) samples without
    # replacement using debug_scale as the fraction.
    parsed = (
        sc.parallelize(the_slice, partitions)
        .flatMap(lambda p: EquivalentSet.get_article(p))
        .sample(False, debug_scale)
        .cache()
    )
    return (
        parsed.flatMap(Guesses.get_article_guesses).cache(),
        parsed.map(lambda a: a.id).collect(),
    )