def pruneRelativeEntropy(filename, outfile): pt = PhraseTable(fileLines(filename)) mapFn = lambda line: computeRelEnt(pt, line) with timer('pruning') as tim: with openMaybeGz(outfile, 'w') as o: count = 0 chunksize = 100 for line in threaded_map(mapFn, fileChunks(filename, chunksize), threadCount = 6, maxInputQ = 1024): o.write(line) count += chunksize if 0 == count % 500: (elapsed, remaining, totalTime) = tim.predict(count, pt.count) print "{0:.3f} elapsed; {1:.3f} remaining; {2:.3f} total; count = {3} \r".format(elapsed, remaining, totalTime, count), ; stdout.flush()
def fileChunks(filename, chunkSize):
    """Yield the contents of ``filename`` as strings of ``chunkSize`` lines each.

    The file is opened for reading with ``openMaybeGz`` and iterated line by
    line. Every ``chunkSize`` consecutive lines are concatenated (line
    endings untouched) and yielded as one string. A final, shorter chunk is
    yielded if the line count is not an exact multiple of ``chunkSize``.
    An empty file yields nothing.

    Args:
        filename: Path of the file to read.
        chunkSize: Number of lines per yielded chunk. If it is zero or
            negative the size test never matches, so the whole file is
            yielded as a single chunk at the end.

    Yields:
        Strings made of up to ``chunkSize`` consecutive lines.
    """
    with openMaybeGz(filename, 'r') as fh:
        # Collect lines in a list and join once per chunk rather than
        # growing a string with +=, which re-copies the chunk on every line
        # in the worst case.
        pending = []
        for line in fh:
            pending.append(line)
            if len(pending) == chunkSize:
                yield ''.join(pending)
                pending = []
        # Flush the trailing partial chunk, if any.
        if pending:
            yield ''.join(pending)