def main(test, train):
    # Build the set of words the test set needs, then scan the training
    # stream and track how many of them it covers.
    need = streamToCounts(wordsInStream(fileLines(test)))
    print len(need)
    count = 0
    have = dict()
    lastAdded = ''
    for tok in wordsInStream(fileLines(train)):
        count += 1
        if tok in need and tok not in have:
            have[tok] = 1
            lastAdded = tok
        # Progress report every million training tokens.
        if (count % 1000000) == 0:
            print str(len(have)) + " / " + str(len(need)) + ": " + lastAdded
    print count
    # Report the needed words that never showed up in the training stream.
    for p in need.iteritems():
        if p[0] not in have:
            print p[0]
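# fileLines, wordsInStream, and streamToCounts are assumed to be defined
# elsewhere in this file; a minimal sketch of what they might look like,
# assuming line-oriented text input and whitespace tokenization:

def fileLines(filename):
    # Yield the lines of a text file one at a time.
    with open(filename) as f:
        for line in f:
            yield line

def wordsInStream(lines):
    # Yield whitespace-separated tokens from a stream of lines.
    for line in lines:
        for tok in line.split():
            yield tok

def streamToCounts(tokens):
    # Tally how often each token occurs.
    d = dict()
    for tok in tokens:
        d[tok] = d.get(tok, 0) + 1
    return d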
from sys import stdout  # for flushing the in-place progress line below

def pruneRelativeEntropy(filename, outfile):
    # Score each chunk of phrase-table lines by relative entropy on worker
    # threads and write the scored output, with a running time estimate.
    pt = PhraseTable(fileLines(filename))
    mapFn = lambda line: computeRelEnt(pt, line)
    with timer('pruning') as tim:
        with openMaybeGz(outfile, 'w') as o:
            count = 0
            chunksize = 100
            # Each item from threaded_map is the scored output for one
            # chunk of `chunksize` input lines.
            for line in threaded_map(mapFn, fileChunks(filename, chunksize),
                                     threadCount=6, maxInputQ=1024):
                o.write(line)
                count += chunksize
                if 0 == count % 500:
                    # Overwrite the same console line (trailing comma plus
                    # "\r" keeps the cursor on it) with a progress estimate.
                    (elapsed, remaining, totalTime) = tim.predict(count, pt.count)
                    print "{0:.3f} elapsed; {1:.3f} remaining; {2:.3f} total; count = {3} \r".format(elapsed, remaining, totalTime, count),
                    stdout.flush()
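# openMaybeGz and threaded_map are also assumed from elsewhere in the
# file. First, a plausible openMaybeGz, guessing that it chooses gzip by
# file extension:
import gzip

def openMaybeGz(path, mode='r'):
    # Open a file, transparently using gzip for .gz paths.
    if path.endswith('.gz'):
        return gzip.open(path, mode)
    return open(path, mode)

# And a minimal, order-preserving threaded_map sketch under assumed
# semantics: apply fn to each item on threadCount worker threads, buffer
# at most maxInputQ pending inputs, and yield results in input order.
from Queue import Queue
from threading import Thread

def threaded_map(fn, items, threadCount=4, maxInputQ=1024):
    inQ = Queue(maxsize=maxInputQ)  # bounded, so we never read too far ahead
    outQ = Queue()

    def worker():
        while True:
            job = inQ.get()
            if job is None:  # sentinel: no more work
                return
            i, item = job
            outQ.put((i, fn(item)))

    workers = [Thread(target=worker) for _ in range(threadCount)]
    for t in workers:
        t.daemon = True
        t.start()

    def feeder():
        n = 0
        for item in items:
            inQ.put((n, item))
            n += 1
        for _ in workers:
            inQ.put(None)
        outQ.put(('done', n))  # tell the consumer the total item count

    ft = Thread(target=feeder)
    ft.daemon = True
    ft.start()

    # Results can arrive out of order; hold early ones until their turn.
    pending = dict()
    total = None
    nextIdx = 0
    while total is None or nextIdx < total:
        i, val = outQ.get()
        if i == 'done':
            total = val
            continue
        pending[i] = val
        while nextIdx in pending:
            yield pending.pop(nextIdx)
            nextIdx += 1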
def getUnigrams(filename, mincount=0):
    # Read "<count> <word>" lines (single-space separated) into a
    # word -> count dict, optionally dropping rare words.
    d = dict()
    try:
        for line in fileLines(filename):
            pcs = line.strip().split(' ')
            if len(pcs) != 2:
                continue  # skip malformed lines
            key = pcs[1]
            if len(key) == 0:
                continue
            d[key] = d.get(key, 0) + int(pcs[0])
        if mincount > 0:
            # Keep only words seen at least mincount times.
            d2 = dict()
            for p in d.iteritems():
                if p[1] < mincount:
                    continue
                d2[p[0]] = p[1]
            d = d2
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        print "error reading from " + filename
    return d
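# Example use of getUnigrams, assuming a counts file with one
# "<count> <word>" pair per line:
#
#   $ cat unigrams.txt
#   12 the
#   7 cat
#   1 xyzzy
#
#   >>> getUnigrams('unigrams.txt', mincount=5)
#   {'the': 12, 'cat': 7}   # dict ordering may vary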