def write_permacache(cls, fd=STDIN, out=STDOUT, num=1000):
    """Reduce each key's listing (read from fd) and store it in the permacache.

    :param int num: maximum listing size
    :param file fd: input stream
    """
    # Bundle the reducer configuration up front; store_keys persists each
    # reduced listing as it is emitted.
    reducer_kwargs = dict(num=num, post=cls.store_keys, fd=fd, out=out)
    mr_tools.mr_reduce_max_per_key(cls._sorting_key, **reducer_kwargs)
def write_permacache(cls, fd=STDIN, out=STDOUT, num=1000):
    """Store the reduced listings read from fd into the permacache.

    :param int num: maximum listing size
    :param file fd: input stream
    """
    mr_tools.mr_reduce_max_per_key(
        cls._sorting_key,
        num=num,
        post=cls.store_keys,
        fd=fd,
        out=out,
    )
def top1k_writefiles(dirname):
    """Split the top 1k entries of every key into a per-key file so that a
    run that fails partway can be restarted without redoing finished keys."""
    def dump_key(key, maxes):
        # One file per key; each line is the key followed by the item fields.
        path = os.path.join(dirname, key)
        with open(path, 'w') as outfile:
            for row in maxes:
                outfile.write('%s\t' % key)
                outfile.write('\t'.join(row))
                outfile.write('\n')

    # Sort on every field except the trailing one, numerically.
    sort_key = lambda x: map(float, x[:-1])
    mr_tools.mr_reduce_max_per_key(sort_key, num=1000, post=dump_key)
def reduce_listings(cls, fd=STDIN, out=STDOUT, num=1000):
    """Debugging reducer.

    Like write_permacache, but just sends the reduced version of the
    listing to stdout instead of to the permacache. It's handy for
    debugging to see the final result before it's written out.

    :param int num: maximum listing size
    :param file fd: input stream
    """
    # No post= hook: the reduced listing is emitted to `out` untouched.
    mr_tools.mr_reduce_max_per_key(cls._sorting_key, num=num, fd=fd, out=out)
def reduce_listings(cls, fd=STDIN, out=STDOUT, num=1000):
    """Debugging counterpart to write_permacache: emit the reduced listing
    on `out` rather than persisting it, so the final result can be
    inspected before it is written out.

    :param int num: maximum listing size
    :param file fd: input stream
    """
    mr_tools.mr_reduce_max_per_key(
        cls._sorting_key,
        num=num,
        fd=fd,
        out=out,
    )
def top1k_writefiles(dirname):
    """Divide up the top 1k of each key into its own file to make
    restarting after a failure much easier. Pairs with
    write_permacache_from_dir.

    :param str dirname: root directory under which the per-key files
        (bucketed into hashed subdirectories) are written
    """
    def hashdir(name, levels=(3,)):
        # levels is a sequence of how long each stage of the hashdir name
        # should be. So (2, 2) would make dirs like 'ab/cd/thelisting.txt'
        # (and this function would just return the string 'ab/cd', so that
        # you have the dirname that you can create before os.path.joining
        # to the filename).
        # NOTE: the default is a tuple, not a list — a mutable default
        # would be shared across calls.
        h = md5(name).hexdigest()
        last = 0
        dirs = []
        for l in levels:
            dirs.append(h[last:last + l])
            last += l
        return os.path.join(*dirs)

    def post(key, maxes):
        # we're taking a hash like 12345678901234567890123456789012
        # and making a directory name out of a prefix of its characters.
        # We may want to tweak this as the number of listings grows.
        hd = os.path.join(dirname, hashdir(key))
        try:
            os.makedirs(hd)
        except OSError as e:
            # the bucket directory may already exist from a previous run
            # (or a concurrent worker); only a real failure is re-raised
            if e.errno != errno.EEXIST:
                raise
        filename = os.path.join(hd, key)
        with open(filename, 'w') as f:
            for item in maxes:
                f.write('%s\t' % key)
                f.write('\t'.join(item))
                f.write('\n')

    mr_tools.mr_reduce_max_per_key(lambda x: map(float, x[:-1]), num=1000,
                                   post=post)
def top1k_writefiles(dirname):
    """Divide up the top 1k of each key into its own file to make
    restarting after a failure much easier. Pairs with
    write_permacache_from_dir.

    :param str dirname: root directory under which the per-key files
        (bucketed into hashed subdirectories) are written
    """
    def hashdir(name, levels=(3,)):
        # levels is a sequence of how long each stage of the hashdir name
        # should be. So (2, 2) would make dirs like "ab/cd/thelisting.txt"
        # (and this function would just return the string "ab/cd", so that
        # you have the dirname that you can create before os.path.joining
        # to the filename).
        # NOTE: immutable tuple default — a list default would be shared
        # between calls.
        h = md5(name).hexdigest()
        last = 0
        dirs = []
        for l in levels:
            dirs.append(h[last : last + l])
            last += l
        return os.path.join(*dirs)

    def post(key, maxes):
        # we're taking a hash like 12345678901234567890123456789012
        # and making a directory name out of a prefix of its characters.
        # We may want to tweak this as the number of listings grows.
        hd = os.path.join(dirname, hashdir(key))
        try:
            os.makedirs(hd)
        except OSError as e:
            # ignore "already exists" — a prior run or concurrent worker
            # may have created the bucket directory first
            if e.errno != errno.EEXIST:
                raise
        filename = os.path.join(hd, key)
        with open(filename, "w") as f:
            for item in maxes:
                f.write("%s\t" % key)
                f.write("\t".join(item))
                f.write("\n")

    mr_tools.mr_reduce_max_per_key(lambda x: map(float, x[:-1]), num=1000, post=post)
def top1k_writepermacache(fd=sys.stdin):
    """Reduce each key's listing read from fd down to its top 1000 entries
    and hand them to store_keys for the permacache."""
    # Entries sort numerically on every field but the last.
    sort_key = lambda x: map(float, x[:-1])
    mr_tools.mr_reduce_max_per_key(sort_key, num=1000, post=store_keys, fd=fd)
def write_permacache(fd=sys.stdin):
    """Reduce the listings read from fd to their top 1000 entries per key
    and persist them via store_keys.

    :param file fd: input stream of mapper output
    """
    # PEP 8: no spaces around '=' for default values and keyword arguments
    # (the original had `fd = sys.stdin` and `fd = fd`).
    mr_tools.mr_reduce_max_per_key(lambda x: map(float, x[:-1]), num=1000,
                                   post=store_keys, fd=fd)
def reduce_listings(fd=sys.stdin):
    """Debugging reducer.

    Like write_permacache, but just sends the reduced version of the
    listing to stdout instead of to the permacache. It's handy for
    debugging to see the final result before it's written out.
    """
    numeric_key = lambda x: map(float, x[:-1])
    mr_tools.mr_reduce_max_per_key(numeric_key, num=1000, fd=fd)