# NOTE: these snippets are excerpted from larger modules and assume module-level
# imports (numpy, logging, random, string, sys, itertools, cPickle) plus helper
# functions such as percent() and stats() defined elsewhere in the package.
def generate_context_vectors():
    """
    Generate the (random) context vectors.
    """

    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap

    if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian":
        context_vectors = [
            numpy.random.normal(size=(wordmap.len,
                                      HYPERPARAMETERS["REPRESENTATION_SIZE"]))
            for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))
        ]
    elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary":
        NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5)
    
        logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
    
        # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"]
        context_vectors = []
        for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])):
            logging.info("Generated %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
            logging.info(stats())
            thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
            for j in range(wordmap.len):
                idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"])
                random.shuffle(idxs)
                for k in idxs[:NONZEROS]:
                    thiscontext[j][k] = random.choice([-1, +1])
    #            print thiscontext[j]
            context_vectors.append(thiscontext)
    else:
        assert 0, "unknown RANDOMIZATION_TYPE: %s" % HYPERPARAMETERS["RANDOMIZATION_TYPE"]
    
    logging.info("Done generating %s context matrixes" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
    logging.info(stats())
    return context_vectors
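# A minimal sketch of how these context vectors are typically consumed in
# random indexing (not part of this module): each word's index vector
# accumulates the random context vectors of its neighbours.  The corpus
# format, the window size, and the use of a single context type below are
# illustrative assumptions only.
import numpy

def _accumulate_index_vectors_sketch(corpus, context_vector, window=2):
    """
    corpus: iterable of sentences, each a list of word ids in [0, vocab size).
    context_vector: one (vocab size x REPRESENTATION_SIZE) matrix, e.g. a
    single entry of the context_vectors list returned above.
    """
    index_vectors = numpy.zeros(context_vector.shape)
    for sentence in corpus:
        for pos, w in enumerate(sentence):
            lo = max(0, pos - window)
            hi = min(len(sentence), pos + window + 1)
            for ctxpos in range(lo, hi):
                if ctxpos != pos:
                    index_vectors[w] += context_vector[sentence[ctxpos]]
    return index_vectors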
def bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
    """
    Given languages l1 and l2 and their bicorpus filenames f1, f2, and falign,
    yield tuples of the form (ws1, ws2, links),
    where ws1 are the word ids in the sentence from f1,
    where ws2 are the word ids in the sentence from f2,
    and links is a list of (i1, i2) word indexes that are linked.
    """
    from w2w.vocabulary import wordmap

    i = 0
    emptycnt = 0
    logging.info("Reading %s,%s sentences and alignments from %s, %s, %s" %
                 (l1, l2, f1, f2, falign))
    fil1, fil2, filalign = open(f1), open(f2), open(falign)
    for (s1, s2, salign) in itertools.izip(fil1, fil2, filalign):
        #     print s1, s2, salign,
        i += 1
        if i % 100000 == 0:
            logging.info("\tRead line %d of %s, %s, %s..." %
                         (i, f1, f2, falign))
            logging.info("\tEmpty sentences are %s..." %
                         (percent(emptycnt, i)))
            logging.info("\t%s" % stats())

        ws1 = [(l1, w1) for w1 in string.split(s1)]
        ws2 = [(l2, w2) for w2 in string.split(s2)]
        ws1 = [wordmap().id(tok) for tok in ws1]
        ws2 = [wordmap().id(tok) for tok in ws2]

        if len(ws1) == 0 or len(ws2) == 0:
            emptycnt += 1
            continue

#     print ws2, [w2w.vocabulary.wordmap.str(w2) for w2 in ws2]
        links = [string.split(link, sep="-") for link in string.split(salign)]
        links = [(int(i1), int(i2)) for i1, i2 in links]

        yield ws1, ws2, links

    # Make sure all iterators are exhausted
    alldone = 0
    try:
        value = fil1.next()
    except StopIteration:
        alldone += 1
    try:
        value = fil2.next()
    except StopIteration:
        alldone += 1
    try:
        value = filalign.next()
    except StopIteration:
        alldone += 1
    assert alldone == 3

    logging.info("DONE. Read line %d of %s, %s, %s..." % (i, f1, f2, falign))
    logging.info("Empty sentences are %s..." % (percent(emptycnt, i)))
    logging.info(stats())
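# Illustrative only: falign is assumed to hold the usual "i1-i2" token-index
# pairs (as produced by GIZA++/Moses-style aligners), one line per sentence
# pair, parallel to f1 and f2.  Given the (hypothetical) lines
#   f1:     the house
#   f2:     la maison
#   falign: 0-0 1-1
# the generator yields (ws1, ws2, links) where ws1/ws2 are the wordmap() ids
# of ("en", "the"), ("en", "house") and ("fr", "la"), ("fr", "maison"), and
# links == [(0, 0), (1, 1)].  The link parsing step alone:
_salign = "0-0 1-1"
_links = [_link.split("-") for _link in _salign.split()]
_links = [(int(_i1), int(_i2)) for _i1, _i2 in _links]
assert _links == [(0, 0), (1, 1)]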
Example #3
def print_aggregate(cnts):
    for k in cnts:
        print k
        tot = 0
        for v, k2 in dictsort(cnts[k]): tot += v
        for v, k2 in dictsort(cnts[k]):
            print "\t", percent(v, tot), k2
def embeddings_debug(w, cnt, str):
    """
    Output the l2norm mean and max of the embeddings, including in debug out the str and training cnt
    """
    totalcnt = numpy.sum(numpy.abs(w) >= 0)
    notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1)
    logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str))
    notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01)
    logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str))

    l2norm = numpy.sqrt(numpy.square(w).sum(axis=1))
    median = numpy.median(l2norm)
    mean = numpy.mean(l2norm)
    std = numpy.std(l2norm)
#    print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),))
    l2norm = l2norm.tolist()
    l2norm.sort()
    l2norm.reverse()
    logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`))
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection

    newx = numpy.zeros((x.shape[0], dimensions))

    nonzeros = x.nonzero()  # (list of rows, list of cols) of all nonzeros

    # (col, row) of all nonzeros
    # We reorder like this so that we can group all columns together, and look up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l])
                      for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()

    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(
                key=col,
                dimensions=dimensions,
                RANDOMIZATION_TYPE=randomization_type,
                RANDOM_SEED=seed)

            randrows_computed += 1
            if randrows_computed % 500 == 0:
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes on %s..." % (
                    percent(randrows_computed, x.shape[1]),
                    percent(l + 1, len(nonzero_colrow)), f)
                print >> sys.stderr, stats()
        newrow = x[row, col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow


#        if (l+1) % 10000 == 0:
#            print >> sys.stderr, "Done with %s of nonzeroes on %s..." % (percent(l+1, len(nonzero_colrow)), f)
#            print >> sys.stderr, stats()
    return newx
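# The memory saving in onlineproject comes from never materializing the full
# (input features x dimensions) projection matrix: the random row for each
# column is regenerated on demand from its key.  The sketch below only
# illustrates the behaviour assumed of pyrandomprojection.randomrow (it is not
# that module's actual implementation); a gaussian variant:
import numpy

def _randomrow_sketch(key, dimensions, seed=0):
    rng = numpy.random.RandomState(hash((seed, key)) % (2 ** 32))
    return rng.normal(size=(dimensions,))

# Deterministic: the same (seed, key) always yields the same row, so rows can
# be recomputed instead of stored.
assert numpy.allclose(_randomrow_sketch(7, 16), _randomrow_sketch(7, 16))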
Example #9
def print_aggregate_compare(cnts, cntsmore):
    """
    Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs.
    """
    cntscopy = copy.deepcopy(cnts)
    for k in cnts:
        print k
        for k2 in cnts[k].keys():
            cntscopy[k][k2] = (1. * cnts[k][k2]/cntsmore[k][k2], cnts[k][k2], cntsmore[k][k2])
        maxperc = dictsort(cntscopy[k])[0][0][0]
        for v, k2 in dictsort(cntscopy[k]):
            # The second column (v[0]/maxperc) is a score for how good this hyperparam is.
            print "\t", k2, "\t", "%.2f" % (v[0]/maxperc), "\t", percent(v[1], v[2], rev=True)
def validate(translation_model, cnt):
    import math
#    logranks = []
#    logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt)
#    logging.info(stats())
    i = 0
    tot = 0
    correct = 0
    for (i, ve) in enumerate(w2w.examples.get_all_validation_examples_cached()):
        correct_sequences, noise_sequences, weights = ebatch_to_sequences([ve])
        source_language = ve.l1
        is_correct = translation_model[source_language].validate_errors(correct_sequences, noise_sequences)
#        print r
        for w in weights: assert w == 1.0

        tot += 1
        if is_correct: correct += 1

        if i % 1000 == 0: logging.info("\tvalidating %d examples done..." % i)
#    logging.info("Validation of model %s at cnt %d: validation err %s" % (translation_model[source_language].modelname, cnt, percent(correct, tot)))
    logging.info("VALIDATION of model at cnt %d: validation accuracy %s" % (cnt, percent(correct, tot)))
    lucene.initVM()
    # create an index called 'index-dir' in a temp directory
#    indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
#                            'index-dir')
#    indexDir = "/Tmp/REMOVEME.index-dir"
    indexDir = "lucene.ukwac"
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
    searcher = IndexSearcher(dir)

    nonzeros = 0

    for i, l in enumerate(sys.stdin):
        if i % 100 == 0:
            print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (i, percent(nonzeros, BLOOM_FILTER_SIZE))
            print >> sys.stderr, stats()
        l = string.strip(l)
        
        added_this_sentence = 0
        for newl in retrieve(l, searcher, queryparser):
            # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
            if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT: break

            newl = string.strip(newl)

            # Hash the sentence
            idx = murmur.string_hash(newl.encode("utf-8")) % BLOOM_FILTER_SIZE
            # Don't use duplicate sentences
            if usedsentences[idx]: continue
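# The usedsentences / BLOOM_FILTER_SIZE logic above is a single-hash,
# Bloom-filter-style dedup: each retrieved sentence is hashed into a fixed-size
# table and skipped if its bucket is already set (accepting occasional false
# positives).  A self-contained sketch of that idea, using hashlib here instead
# of the murmur module used in the snippet:
import hashlib

_BLOOM_SIZE_SKETCH = 1000003
_used_sketch = [0] * _BLOOM_SIZE_SKETCH

def _seen_before_sketch(sentence):
    _idx = int(hashlib.md5(sentence.encode("utf-8")).hexdigest(), 16) % _BLOOM_SIZE_SKETCH
    if _used_sketch[_idx]:
        return True
    _used_sketch[_idx] = 1
    return False

assert not _seen_before_sketch("the cat sat on the mat")
assert _seen_before_sketch("the cat sat on the mat")    # duplicate is caught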
        word = vals[0]
        if HYPERPARAMETERS["W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION"] and word != "*UNKNOWN*":
            if (word[0] == '*' and word[-1] == '*' and len(word) > 1):
                print >> sys.stderr, "WEIRD WORD: %s" % word
            word = string.lower(word)
        assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"]
        tot += 1
        if tot % 10000 == 0:
            print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])
        if word in original_embeddings:
#            print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
            continue
        else:
            original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])
    print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
    print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot-len(original_embeddings), tot)
    print >> sys.stderr, stats()

    reversemap = targetmap(name="reverse")

    embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
    assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])

    ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
    for w in range(wordmap().len):
        embedding = None
        # If this word is in a different language than the embeddings.
        if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
            if w not in reversemap:
                print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
                embedding = original_embeddings["*UNKNOWN*"]
    state.act = ACT
    state.depth = DEPTH
    state.depthbegin = depthbegin
    state.n_hid = N_HID
    state.noise = NOISE
    state.activation_regularization_coeff = ACTIVATION_REGULARIZATION_COEFF
    state.weight_regularization_coeff = WEIGHT_REGULARIZATION_COEFF
    state.nepochs = NEPOCHS
    state.LR = LR
    state.noise_lvl = NOISE_LVL
    state.epochstest = EPOCHSTEST
    channel.save()


    for depth in xrange(depthbegin,DEPTH):
        print >> sys.stderr, 'BEGIN DEPTH %s...' % (percent(depth+1, DEPTH))
        print >> sys.stderr, stats()
        if depth == 0:
            n_aux = NINPUTS
        else:
            n_aux = model.layers[depth-1].n_out
        if depth==0 and INPUTTYPE == 'tfidf':
            model.depth_max = model.depth_max+1
            model.reconstruction_cost = 'quadratic'
            model.reconstruction_cost_fn = quadratic_cost
            model.auxiliary(init=1,auxact='softplus',auxdepth=-DEPTH+depth+1, auxn_out=n_aux)
        else:
            model.depth_max = model.depth_max+1
            if depth == 0 or ACT[depth-1] != 'rectifier':
                model.reconstruction_cost = 'cross_entropy'
                model.reconstruction_cost_fn = cross_entropy_cost
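# quadratic_cost and cross_entropy_cost above are Theano expressions defined
# elsewhere in this codebase; the numpy sketches below only illustrate,
# roughly, what the two reconstruction costs compute.
import numpy

def _quadratic_cost_sketch(x, xrec):
    # squared reconstruction error -- the 'quadratic' cost selected above for
    # tf-idf inputs
    return numpy.mean(numpy.sum((x - xrec) ** 2, axis=1))

def _cross_entropy_cost_sketch(x, xrec, eps=1e-8):
    # per-dimension binary cross-entropy -- the 'cross_entropy' cost selected
    # above otherwise
    xrec = numpy.clip(xrec, eps, 1 - eps)
    return numpy.mean(numpy.sum(-x * numpy.log(xrec)
                                - (1 - x) * numpy.log(1 - xrec), axis=1))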
print >> sys.stderr, "Writing to %s" % `outfilenames`

for f in filenames: assert os.path.exists(f)
for f in outfilenames:
    if os.path.exists(f):
        print >> sys.stderr, "Warning, going to overwrite %s" % f

#print "Sleeping for 10 seconds..."
#import time
#time.sleep(10)

inf = [open(f) for f in filenames]
outf = [open(f, "wt") for f in outfilenames]

tot = 0
cnt = 0
for lines in izip(*inf):
    tot += 1
    keep = False
    for w in string.split(lines[0]):
        if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
            keep = True
            break
    if keep:
        cnt += 1
        for l, f in izip(lines, outf):
            f.write(l)
    if tot % 10000 == 0:
        print >> sys.stderr, "%s lines kept" % percent(cnt, tot)
        print >> sys.stderr, stats()
    channel.save()

    err = dict([(trainsize, {}) for trainsize in VALIDATION_TRAININGSIZE])
    rebuildunsup(model,LR,NOISE_LVL,ACTIVATION_REGULARIZATION_COEFF, WEIGHT_REGULARIZATION_COEFF, BATCHSIZE,train)

    epoch = 0
    if epoch in EPOCHSTEST:
        svm_validation(err, epoch, model,train,datatrain,datatrainsave,datatest,datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST)
        channel.save()

    train_reconstruction_error_mvgavg = MovingAverage()
    for epoch in xrange(1,NEPOCHS+1):
        time1 = time.time()
        state.currentepoch = epoch
        for filenb in xrange(1,NB_FILES + 1):
            print >> sys.stderr, "\t\tAbout to read file %s..." % percent(filenb, NB_FILES)
            print >> sys.stderr, "\t\t", stats()
#                initial_file_time = time.time()
            f =open(PATH_DATA + NAME_DATA +'_%s.pkl'%filenb,'r')
            object = numpy.asarray(cPickle.load(f),dtype=theano.config.floatX)
            print >> sys.stderr, "\t\t...read file %s" % percent(filenb, NB_FILES)
            print >> sys.stderr, "\t\t", stats()
            # The last training file is not of the same shape as the other training files.
            # So, to avoid a GPU memory error, we want to make sure it is the same size.
            # In which case, we pad the matrix but keep track of how many n (instances) there actually are.
            # TODO: Also want to pad trainl
            if object.shape == normalshape:
                train.container.value[:] = object
                currentn = normalshape[0]
                del object
            else:
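# The else-branch is cut off in this snippet; per the comments above, its
# intent is to pad the smaller final training file up to normalshape (so the
# shared buffer keeps a fixed size) while currentn records how many rows are
# real.  A self-contained numpy sketch of that padding step (an assumption --
# the actual branch is not shown here):
import numpy

def _pad_to_shape_sketch(block, normalshape):
    assert block.shape[0] <= normalshape[0] and block.shape[1:] == tuple(normalshape[1:])
    padded = numpy.zeros(normalshape, dtype=block.dtype)
    padded[:block.shape[0]] = block
    return padded, block.shape[0]           # (padded block, currentn)

_padded, _currentn = _pad_to_shape_sketch(numpy.ones((3, 5)), (8, 5))
assert _padded.shape == (8, 5) and _currentn == 3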
    from targetvocabulary import targetmap

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
            continue
        if w1 not in targetmap():
            print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
            continue
        for l2 in targetmap()[w1]:
            totcnt = 0
            for cnt, w2 in dictsort(targetmap()[w1][l2]): totcnt += cnt
            print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])]

    print >> sys.stderr, "REVERSE MAP NOW"

    for w1 in wordmap().all:
        w1 = wordmap().id(w1)
        # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
        assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
        if language(w1) is None:
            print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
            continue
        if w1 not in targetmap(name="reverse"):
            print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
            continue
        for l2 in targetmap(name="reverse")[w1]:
            totcnt = 0
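# Illustrative only: targetmap() above behaves like a nested dict
# {source word id: {target language: {target word id: count}}}, and the
# printed numbers are each count as a share of that language's total, e.g.:
_translations_sketch = {"fr": {"maison": 8, "foyer": 2}}   # made-up counts
for _l2 in _translations_sketch:
    _totcnt = sum(_translations_sketch[_l2].values())
    _dist = sorted(((_cnt, _w2) for _w2, _cnt in _translations_sketch[_l2].items()),
                   reverse=True)
    assert _totcnt == 10 and _dist[0] == (8, "maison")     # i.e. 80% "maison"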
                      "--seed",
                      dest="seed",
                      default=0,
                      type="int",
                      help="random seed")
    (options, args) = parser.parse_args()

    import random
    random.seed(options.seed)
    numpy.random.seed(options.seed)

    assert len(args) > 0  # You need to pass in pkl files to project.

    for i, f in enumerate(args):
        print >> sys.stderr, "\nLoading %s (file %s)..." % (
            f, percent(i + 1, len(args)))
        print >> sys.stderr, stats()
        x = cPickle.load(open(f, "rb"))
        print >> sys.stderr, "...loading %s (file %s)" % (
            f, percent(i + 1, len(args)))
        print >> sys.stderr, stats()

        assert x.ndim == 2
        print >> sys.stderr, "Read instance matrix with shape %s, creating projection with shape %s" % (
            x.shape, (x.shape[0], options.dimensions))

        newx = project(x,
                       dimensions=options.dimensions,
                       seed=options.seed,
                       randomization_type=RANDOMIZATION_TYPE,
                       mode=MODE)
Exemple #23
0
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection
    
    newx = numpy.zeros((x.shape[0], dimensions))

    nonzeros = x.nonzero()      # (list of rows, list of cols) of all nonzeros
    
    # (col, row) of all nonzeros
    # We reorder like this so that we can group all columns together, and look up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l]) for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()
    
    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(key=col, dimensions=dimensions, RANDOMIZATION_TYPE=randomization_type, RANDOM_SEED=seed)

            randrows_computed += 1
            if randrows_computed % 500 == 0:
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes on %s..." % (percent(randrows_computed, x.shape[1]), percent(l+1, len(nonzero_colrow)), f)
                print >> sys.stderr, stats()
        newrow = x[row,col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow
#        if (l+1) % 10000 == 0:
#            print >> sys.stderr, "Done with %s of nonzeroes on %s..." % (percent(l+1, len(nonzero_colrow)), f)
#            print >> sys.stderr, stats()
    return newx
Example #24
if __name__ == "__main__":
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-d", "--dimensions", dest="dimensions", default=1000, type="int", help="number of dimensions in random output")
    parser.add_option("-s", "--seed", dest="seed", default=0, type="int", help="random seed")
    (options, args) = parser.parse_args()

    import random
    random.seed(options.seed)
    numpy.random.seed(options.seed)

    assert len(args) > 0    # You need to pass in pkl files to project.

    for i, f in enumerate(args):
        print >> sys.stderr, "\nLoading %s (file %s)..." % (f, percent(i+1, len(args)))
        print >> sys.stderr, stats()
        x = cPickle.load(open(f, "rb"))
        print >> sys.stderr, "...loading %s (file %s)" % (f, percent(i+1, len(args)))
        print >> sys.stderr, stats()

        assert x.ndim == 2
        print >> sys.stderr, "Read instance matrix with shape %s, creating projection with shape %s" % (x.shape, (x.shape[0], options.dimensions))

        newx = project(x, dimensions=options.dimensions, seed=options.seed, randomization_type=RANDOMIZATION_TYPE, mode=MODE)
        assert newx.shape == (x.shape[0], options.dimensions)

        if SCALE_BEFORE_SQUASH is None:
            SCALE_BEFORE_SQUASH = 1. / newx.std()
            print >> sys.stderr, "Setting SCALE_BEFORE_SQUASH to %f on the basis of %s" % (SCALE_BEFORE_SQUASH, f)