Example no. 1
0
def main(test, train):
    need = streamToCounts(wordsInStream(fileLines(test)))
    print len(need)
    count = 0
    have = dict()
    lastAdded = ''
    for tok in wordsInStream(fileLines(train)):
        count += 1
        if tok in need and tok not in have:
            have[tok] = 1
            lastAdded = tok
        if (count % 1000000) == 0:
            print str(len(have)) + " / " + str(len(need)) + ": " + lastAdded
    print count
    for p in need.iteritems():
        if p[0] not in have:
            print p[0]
def pruneRelativeEntropy(filename, outfile):
    # Prune a phrase table by relative entropy: load the full table from
    # `filename`, score/filter chunks of its lines on worker threads via
    # computeRelEnt, and write the surviving text to `outfile`, printing a
    # progress/ETA line roughly every 500 input lines.
    #
    # filename -- path of the phrase-table file; read twice (once to build
    #             the PhraseTable index, once in chunks for scoring)
    # outfile  -- output path; opened with openMaybeGz, so presumably a
    #             ".gz" name writes gzip-compressed output — TODO confirm
    pt = PhraseTable(fileLines(filename))
    # One worker call maps a chunk of raw input lines to the (possibly
    # reduced) text that should be written out for that chunk.
    mapFn = lambda line: computeRelEnt(pt, line)
    with timer('pruning') as tim:
        with openMaybeGz(outfile, 'w') as o:
            count = 0
            chunksize = 100
            # threaded_map runs mapFn on 6 threads, holding at most 1024
            # pending chunks; results are written in the order received.
            for line in threaded_map(mapFn, fileChunks(filename, chunksize), threadCount = 6, maxInputQ = 1024):
                o.write(line)
                count += chunksize   # approximate: advances per chunk, not per actual line
                if 0 == count % 500:
                    # NOTE(review): pt.count is presumably the total number of
                    # table entries, used to extrapolate the remaining time.
                    (elapsed, remaining, totalTime) = tim.predict(count, pt.count)
                    # Trailing comma + "\r" keeps the progress line in place on the console.
                    print "{0:.3f} elapsed; {1:.3f} remaining; {2:.3f} total; count = {3}  \r".format(elapsed, remaining, totalTime, count), ; stdout.flush()
Example no. 3
0
def getUnigrams(filename, mincount = 0):
    """Read a "<count> <word>" file and return {word: summed count}.

    Lines without exactly two space-separated fields, or with an empty
    word field, are skipped.  With mincount > 0, words whose total count
    is below the threshold are dropped from the result.

    filename -- path of the unigram-count file, one "count word" per line
    mincount -- minimum total count a word needs to be kept (0 keeps all)

    On I/O failure, prints an error message and returns whatever was
    accumulated before the failure.
    """
    d = dict()
    try:
        for line in fileLines(filename):
            pcs = line.strip().split(' ')
            if len(pcs) != 2: continue
            key = pcs[1]
            if len(key) == 0: continue
            # NOTE(review): a non-integer count field raises ValueError here,
            # which is NOT caught below — only IOError is handled.
            d[key] = d.get(key, 0) + int(pcs[0])
        if mincount > 0:
            # Keep only the words whose accumulated total reaches the threshold.
            d = dict((k, v) for k, v in d.iteritems() if v >= mincount)
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        print "error reading from " + filename
    return d