Example #1
def indexed_weights():

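    # Return one weight per vocabulary id, cached in _indexed_weights:
    # uniform (all ones) when NGRAM_FOR_TRAINING_NOISE is 0, smoothed unigram
    # counts when it is 1; any other setting is unsupported.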
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print >> sys.stderr, len(
        wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"]
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        _indexed_weights = [1 for id in range(wordmap.len)]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        from common.json import load
        from common.file import myopen
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(
            HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"],
            HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print >> sys.stderr, "Reading ngrams from", ngrams_file, "..."
        from collections import defaultdict
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS[
                "TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [
            ngramcnt[wordmap.str(id)] for id in range(len(wordmap.map))
        ]
        _indexed_weights = build(_indexed_weights)
    else:
        assert 0
    return _indexed_weights
Example #2
def visualize(cnt, embeddings, rundir, idxs, str):
    """
    Visualize a set of examples using t-SNE.
    """
    from vocabulary import wordmap
    PERPLEXITY = 30

    x = embeddings[idxs]
    print x.shape
    titles = [wordmap.str(id) for id in idxs]

    import os.path
    filename = os.path.join(rundir, "embeddings-%s-%d.png" % (str, cnt))
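    # Reduce the selected embeddings to 2-D with t-SNE and render a labelled
    # scatter plot; if rendering fails with an IOError, log it and move on.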
    try:
        from textSNE.calc_tsne import tsne
#       from textSNE.tsne import tsne
        out = tsne(x, perplexity=PERPLEXITY)
        from textSNE.render import render
        render([(title, point[0], point[1]) for title, point in zip(titles, out)], filename)
    except IOError:
        logging.info("ERROR visualizing", filename, ". Continuing...")
Example #3
def indexed_weights():
    
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print >> sys.stderr, len(wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"]
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        _indexed_weights = [1 for id in range(wordmap.len)]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        from common.json import load
        from common.file import myopen
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"], HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print >> sys.stderr, "Reading ngrams from", ngrams_file, "..."
        from collections import defaultdict
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [ngramcnt[wordmap.str(id)] for id in range(len(wordmap.map))]
        _indexed_weights = build(_indexed_weights)
    else: assert 0
    return _indexed_weights
Example #4
        for ebatch in get_train_minibatch:
            cnt += len(ebatch)
            #for e in ebatch:
                #print [wordmap.str(id) for id in e]
                #print e
            m.train(ebatch)
            #validate(cnt)
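            # Rounding 1000 down to a multiple of the minibatch size means
            # this logs roughly every 1,000 training examples.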
            if cnt % (int(1000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                logging.info("Finished training step %d (epoch %d)" % (cnt, epoch))
#                print ("Finished training step %d (epoch %d)" % (cnt, epoch))
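            # A file named BAD in the run directory acts as a manual kill
            # switch, checked roughly every 100,000 examples.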
            if cnt % (int(100000./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                if os.path.exists(os.path.join(rundir, "BAD")):
                    logging.info("Detected file: %s\nSTOPPING" % os.path.join(rundir, "BAD"))
                    sys.stderr.write("Detected file: %s\nSTOPPING\n" % os.path.join(rundir, "BAD"))
                    sys.exit(0)
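            # Checkpoint the model and validate roughly every VALIDATE_EVERY
            # examples (again rounded to a multiple of the minibatch size).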
            if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"]*1./HYPERPARAMETERS["MINIBATCH SIZE"])*HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                state.save(m, cnt, epoch, get_train_minibatch, rundir, newkeystr)                       
                validate(cnt)
        get_train_minibatch = examples.TrainingMinibatchStream()
        epoch += 1
    #output the embedding
    outfile = open(HYPERPARAMETERS["EMBEDDING_FILE"], 'w')
    from vocabulary import wordmap
    for i in range(m.parameters.vocab_size):
        outfile.write(wordmap.str(i)+'\t')
        for v in m.parameters.embeddings[i]:
            outfile.write(str(v)+'\t')
        outfile.write('\n')
    outfile.flush()
    outfile.close()
Example #5
#!/usr/bin/env python
"""
Dump n-gram counts over entire training data as YAML.
"""

import sys
from common.stats import stats
from hyperparameters import HYPERPARAMETERS
from collections import defaultdict
cnt = defaultdict(int)
if __name__ == "__main__":


    import vocabulary
    print >> sys.stderr, "Reading vocab"
    vocabulary.read()
    from vocabulary import wordmap

    import train
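    # Count each training example as a tuple of its word strings; report
    # progress every 10,000 examples.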
    for (i, e) in enumerate(train.get_train_example()):
        cnt[tuple([wordmap.str(t) for t in e])] += 1
        if i % 10000 == 0:
            print >> sys.stderr, "Read %d examples" % i
            print >> sys.stderr, stats()
        if i > 100000000:
            break
    cnt = [(t, cnt[t]) for t in cnt]
    import common.json
    common.json.dump(cnt, sys.stdout)
Example #6
        for i in range(len(tokens)):
            for j, context in enumerate(HYPERPARAMETERS["CONTEXT_TYPES"]):
                for k in context:
                    tokidx = i + k
                    if tokidx < 0 or tokidx >= len(tokens): continue
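                    # Accumulate the context vector of the neighbouring token
                    # (offset k) into this token's random representation.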
                    random_representations[tokens[i]] += context_vectors[j][tokens[tokidx]]
        cnt += 1
        if cnt % 10000 == 0:
            diagnostics.diagnostics(cnt, random_representations)

    logging.info("DONE. Dividing embeddings by their standard deviation...")
    random_representations = random_representations * (1. / numpy.std(random_representations))
    diagnostics.diagnostics(cnt, random_representations)
    diagnostics.visualizedebug(cnt, random_representations, rundir, newkeystr)

    outfile = os.path.join(rundir, "random_representations")
    if newkeystr != "":
        verboseoutfile = os.path.join(rundir, "random_representations%s" % newkeystr)
        logging.info("Writing representations to %s, and creating link %s" % (outfile, verboseoutfile))
        os.system("ln -s random_representations %s " % (verboseoutfile))
    else:
        logging.info("Writing representations to %s, not creating any link because of default settings" % outfile)

    o = open(outfile, "wt")
    from vocabulary import wordmap
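    # One word per line: the word followed by its representation values,
    # space-separated.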
    for i in range(wordmap.len):
        o.write(wordmap.str(i) + " ")
        for v in random_representations[i]:
            o.write(`v` + " ")
        o.write("\n")
Example #7
#!/usr/bin/env python

from optparse import OptionParser

parser = OptionParser()
parser.add_option("-m", "--modelfile", dest="modelfile")
(options, args) = parser.parse_args()
assert options.modelfile is not None

import cPickle

m = cPickle.load(open(options.modelfile))
# print m.parameters.embeddings.shape

from vocabulary import wordmap
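# Print one line per word: the word followed by its embedding values.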

for i in range(m.parameters.vocab_size):
    print wordmap.str(i),
    for v in m.parameters.embeddings[i]:
        print v,
    print
Example #8
#!/usr/bin/env python

from optparse import OptionParser
parser = OptionParser()
parser.add_option("-m", "--modelfile", dest="modelfile")
(options, args) = parser.parse_args()
assert options.modelfile is not None

import cPickle
m = cPickle.load(open(options.modelfile))
#print m.parameters.embeddings.shape

from vocabulary import wordmap
for i in range(m.parameters.vocab_size):
    print wordmap.str(i),
    for v in m.parameters.embeddings[i]:
        print v,
    print
Example #9
                      HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                logging.info("Finished training step %d (epoch %d)" %
                             (cnt, epoch))
#                print ("Finished training step %d (epoch %d)" % (cnt, epoch))
            if cnt % (int(100000. / HYPERPARAMETERS["MINIBATCH SIZE"]) *
                      HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                if os.path.exists(os.path.join(rundir, "BAD")):
                    logging.info("Detected file: %s\nSTOPPING" %
                                 os.path.join(rundir, "BAD"))
                    sys.stderr.write("Detected file: %s\nSTOPPING\n" %
                                     os.path.join(rundir, "BAD"))
                    sys.exit(0)
            if cnt % (int(HYPERPARAMETERS["VALIDATE_EVERY"] * 1. /
                          HYPERPARAMETERS["MINIBATCH SIZE"]) *
                      HYPERPARAMETERS["MINIBATCH SIZE"]) == 0:
                state.save(m, cnt, epoch, get_train_minibatch, rundir,
                           newkeystr)
                validate(cnt)
        get_train_minibatch = examples.TrainingMinibatchStream()
        epoch += 1
    #output the embedding
    outfile = open(HYPERPARAMETERS["EMBEDDING_FILE"], 'w')
    from vocabulary import wordmap
    for i in range(m.parameters.vocab_size):
        outfile.write(wordmap.str(i) + '\t')
        for v in m.parameters.embeddings[i]:
            outfile.write(str(v) + '\t')
        outfile.write('\n')
    outfile.flush()
    outfile.close()
Example #10
                    random_representations[tokens[i]] += context_vectors[j][
                        tokens[tokidx]]
        cnt += 1
        if cnt % 10000 == 0:
            diagnostics.diagnostics(cnt, random_representations)

    logging.info("DONE. Dividing embeddings by their standard deviation...")
    random_representations = random_representations * (
        1. / numpy.std(random_representations))
    diagnostics.diagnostics(cnt, random_representations)
    diagnostics.visualizedebug(cnt, random_representations, rundir, newkeystr)

    outfile = os.path.join(rundir, "random_representations")
    if newkeystr != "":
        verboseoutfile = os.path.join(rundir,
                                      "random_representations%s" % newkeystr)
        logging.info("Writing representations to %s, and creating link %s" %
                     (outfile, verboseoutfile))
        os.system("ln -s random_representations %s " % (verboseoutfile))
    else:
        logging.info(
            "Writing representations to %s, not creating any link because of default settings"
            % outfile)

    o = open(outfile, "wt")
    from vocabulary import wordmap
    for i in range(wordmap.len):
        o.write(wordmap.str(i) + " ")
        for v in random_representations[i]:
            o.write(`v` + " ")
        o.write("\n")
Example #11
#!/usr/bin/env python
"""
Dump n-gram counts over entire training data as YAML.
"""

import sys
from common.stats import stats

from collections import defaultdict
cnt = defaultdict(int)
if __name__ == "__main__":
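    # Read the language-model hyperparameters and re-parse them against the
    # command-line options.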
    import common.hyperparameters, common.options
    HYPERPARAMETERS = common.hyperparameters.read("language-model")
    HYPERPARAMETERS, options, args = common.options.reparse(HYPERPARAMETERS)
    import hyperparameters

    import vocabulary
    print >> sys.stderr, "Reading vocab"
    vocabulary.read()
    from vocabulary import wordmap

    import train
    for (i, e) in enumerate(train.get_train_example()):
        cnt[tuple([wordmap.str(t) for t in e])] += 1
        if i % 10000 == 0:
            print >> sys.stderr, "Read %d examples" % i
            print >> sys.stderr, stats()
        if i > 100000000:
            break
    cnt = [(t, cnt[t]) for t in cnt]
    import common.json
    common.json.dump(cnt, sys.stdout)