def generate_context_vectors():
    """ Generate the (random) context vectors. """
    HYPERPARAMETERS = common.hyperparameters.read("random-indexing")
    from vocabulary import wordmap

    if HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "gaussian":
        context_vectors = [numpy.random.normal(size=(wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"])) for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"]))]
    elif HYPERPARAMETERS["RANDOMIZATION_TYPE"] == "ternary":
        NONZEROS = int(HYPERPARAMETERS["TERNARY_NON_ZERO_PERCENT"] * HYPERPARAMETERS["REPRESENTATION_SIZE"] + 0.5)
        logging.info("Generating %d nonzeros per %d-length random context vector" % (NONZEROS, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
        # Generate one set of context vectors per list in HYPERPARAMETERS["CONTEXT_TYPES"]
        context_vectors = []
        for i in range(len(HYPERPARAMETERS["CONTEXT_TYPES"])):
            logging.info("Generated %s context matrices" % (percent(i, len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
            logging.info(stats())
            thiscontext = numpy.zeros((wordmap.len, HYPERPARAMETERS["REPRESENTATION_SIZE"]))
            for j in range(wordmap.len):
                # Pick NONZEROS random positions and set each to -1 or +1.
                idxs = range(HYPERPARAMETERS["REPRESENTATION_SIZE"])
                random.shuffle(idxs)
                for k in idxs[:NONZEROS]:
                    thiscontext[j][k] = random.choice([-1, +1])
#                print thiscontext[j]
            context_vectors.append(thiscontext)
    else:
        assert 0, "Unknown RANDOMIZATION_TYPE: %s" % HYPERPARAMETERS["RANDOMIZATION_TYPE"]

    logging.info("Done generating %s context matrices" % (percent(len(HYPERPARAMETERS["CONTEXT_TYPES"]), len(HYPERPARAMETERS["CONTEXT_TYPES"]))))
    logging.info(stats())
    return context_vectors
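# Hedged illustration, not part of the original module: how a single ternary
# random-index row is drawn.  The representation size (10) and nonzero
# fraction (20%) are invented for this example; the hyperparameters in
# generate_context_vectors() play the same roles.
import random
import numpy

REPRESENTATION_SIZE_DEMO = 10
NONZEROS_DEMO = int(0.2 * REPRESENTATION_SIZE_DEMO + 0.5)
row_demo = numpy.zeros(REPRESENTATION_SIZE_DEMO)
for k in random.sample(range(REPRESENTATION_SIZE_DEMO), NONZEROS_DEMO):
    row_demo[k] = random.choice([-1, +1])
# row_demo now has exactly NONZEROS_DEMO entries that are -1 or +1; the rest are zero.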
def bicorpus_sentences_and_alignments(l1, l2, f1, f2, falign):
    """
    Given languages l1 and l2 and their bicorpus filenames f1, f2, and falign,
    yield tuples of the form (ws1, ws2, links), where ws1 are the word ids in
    the sentence from f1, ws2 are the word ids in the sentence from f2, and
    links is a list of (i1, i2) word indexes that are linked.
    """
    from w2w.vocabulary import wordmap
    i = 0
    emptycnt = 0
    logging.info("Reading %s,%s sentences and alignments from %s, %s, %s" % (l1, l2, f1, f2, falign))
    fil1, fil2, filalign = open(f1), open(f2), open(falign)
    for (s1, s2, salign) in itertools.izip(fil1, fil2, filalign):
#        print s1, s2, salign,
        i += 1
        if i % 100000 == 0:
            logging.info("\tRead line %d of %s, %s, %s..." % (i, f1, f2, falign))
            logging.info("\tEmpty sentences are %s..." % (percent(emptycnt, i)))
            logging.info("\t%s" % stats())

        ws1 = [(l1, w1) for w1 in string.split(s1)]
        ws2 = [(l2, w2) for w2 in string.split(s2)]
        ws1 = [wordmap().id(tok) for tok in ws1]
        ws2 = [wordmap().id(tok) for tok in ws2]

        if len(ws1) == 0 or len(ws2) == 0:
            emptycnt += 1
            continue

#        print ws2, [w2w.vocabulary.wordmap.str(w2) for w2 in ws2]
        links = [string.split(link, sep="-") for link in string.split(salign)]
        links = [(int(i1), int(i2)) for i1, i2 in links]
        yield ws1, ws2, links

    # Make sure all iterators are exhausted.
    alldone = 0
    try:
        value = fil1.next()
    except StopIteration:
        alldone += 1
    try:
        value = fil2.next()
    except StopIteration:
        alldone += 1
    try:
        value = filalign.next()
    except StopIteration:
        alldone += 1
    assert alldone == 3

    logging.info("DONE. Read line %d of %s, %s, %s..." % (i, f1, f2, falign))
    logging.info("Empty sentences are %s..." % (percent(emptycnt, i)))
    logging.info(stats())
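# Hedged usage sketch (not in the original source): how the generator above
# might be consumed.  The language codes and file names are hypothetical.
#
#   for ws1, ws2, links in bicorpus_sentences_and_alignments(
#           "en", "fr", "corpus.en", "corpus.fr", "corpus.align"):
#       for i1, i2 in links:
#           pass    # ws1[i1] (id of an "en" word) is aligned to ws2[i2] (id of an "fr" word)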
def print_aggregate(cnts):
    for k in cnts:
        print k
        tot = 0
        for v, k2 in dictsort(cnts[k]):
            tot += v
        for v, k2 in dictsort(cnts[k]):
            print "\t", percent(v, tot), k2
def embeddings_debug(w, cnt, str):
    """
    Log summary statistics of the embeddings w: the fraction of entries with
    non-negligible absolute value, and the median, mean, stddev, and top-3
    values of the per-row l2 norms.  str labels the embeddings and cnt is the
    training count; both are included in the log output.
    """
    totalcnt = numpy.sum(numpy.abs(w) >= 0)
    notsmallcnt = numpy.sum(numpy.abs(w) >= 0.1)
    logging.info("%d %s dimensions of %s have absolute value >= 0.1" % (cnt, percent(notsmallcnt, totalcnt), str))
    notsmallcnt = numpy.sum(numpy.abs(w) >= 0.01)
    logging.info("%d %s dimensions of %s have absolute value >= 0.01" % (cnt, percent(notsmallcnt, totalcnt), str))

    l2norm = numpy.sqrt(numpy.square(w).sum(axis=1))
    median = numpy.median(l2norm)
    mean = numpy.mean(l2norm)
    std = numpy.std(l2norm)
#    print("%d l2norm of top 100 words: mean = %f stddev=%f" % (cnt, numpy.mean(l2norm), numpy.std(l2norm),))
    l2norm = l2norm.tolist()
    l2norm.sort()
    l2norm.reverse()
    logging.info("%d l2norm of %s: median = %f mean = %f stddev=%f top3=%s" % (cnt, str, median, mean, std, `l2norm[:3]`))
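# Self-contained sketch (illustration only, not original code): the same
# per-row l2-norm summary computed on a made-up 1000x50 Gaussian matrix
# standing in for an embedding matrix w.
import numpy

w_demo = numpy.random.normal(size=(1000, 50))
l2norm_demo = numpy.sqrt(numpy.square(w_demo).sum(axis=1))
print "median=%f mean=%f stddev=%f" % (numpy.median(l2norm_demo), numpy.mean(l2norm_demo), numpy.std(l2norm_demo))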
def onlineproject(x, dimensions, seed, randomization_type):
    # Online (low-memory) random projection
    newx = numpy.zeros((x.shape[0], dimensions))
    nonzeros = x.nonzero()      # (list of rows, list of cols) of all nonzeros

    # (col, row) of all nonzeros.
    # We reorder like this so that we can group all columns together, and look
    # up the randomrow for each column feature only once.
    nonzero_colrow = [(nonzeros[1][l], nonzeros[0][l]) for l in range(len(nonzeros[0]))]
    nonzero_colrow.sort()
    nonzero_colrow.reverse()

    randrow_key = None
    randrow_values = None
    randrows_computed = 0
    for l, (col, row) in enumerate(nonzero_colrow):
        if randrow_key != col:
            randrow_key = col
            randrow_values = pyrandomprojection.randomrow(key=col, dimensions=dimensions, RANDOMIZATION_TYPE=randomization_type, RANDOM_SEED=seed)
            randrows_computed += 1
            if randrows_computed % 500 == 0:
                # NB: `f` (the current filename) is a global set by the __main__ loop below.
                print >> sys.stderr, "Retrieved %s random rows thus far, done with %s of nonzeroes on %s..." % (percent(randrows_computed, x.shape[1]), percent(l+1, len(nonzero_colrow)), f)
                print >> sys.stderr, stats()
        newrow = x[row, col] * randrow_values
        assert newx[row].shape == newrow.shape
        newx[row] += newrow
#        if (l+1) % 10000 == 0:
#            print >> sys.stderr, "Done with %s of nonzeroes on %s..." % (percent(l+1, len(nonzero_colrow)), f)
#            print >> sys.stderr, stats()
    return newx
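# Hedged sketch, not from the original repo: the core of the online projection
# above, on a dense toy matrix, with a seeded numpy generator standing in for
# pyrandomprojection.randomrow (whose exact behavior is not reproduced here).
# Every nonzero x[row, col] adds x[row, col] * randomrow(col) into the
# projected row.
import numpy

def _toy_randomrow(col, dimensions, seed=0):
    # Deterministic pseudo-random row keyed by the input column (a stand-in).
    rng = numpy.random.RandomState((seed * 1000003 + col) % (2 ** 32))
    return rng.normal(size=dimensions)

x_demo = numpy.array([[0., 2., 0.],
                      [1., 0., 3.]])
dimensions_demo = 4
newx_demo = numpy.zeros((x_demo.shape[0], dimensions_demo))
for row, col in zip(*x_demo.nonzero()):
    newx_demo[row] += x_demo[row, col] * _toy_randomrow(col, dimensions_demo)
# newx_demo is the 2x4 random projection of the 2x3 input.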
def print_aggregate_compare(cnts, cntsmore):
    """ Compare the hyperparams in the TOP jobs to the hyperparams in the MORE jobs. """
    cntscopy = copy.deepcopy(cnts)
    for k in cnts:
        print k
        for k2 in cnts[k].keys():
            cntscopy[k][k2] = (1. * cnts[k][k2] / cntsmore[k][k2], cnts[k][k2], cntsmore[k][k2])
        maxperc = dictsort(cntscopy[k])[0][0][0]
        for v, k2 in dictsort(cntscopy[k]):
            # The second column (v[0]/maxperc) is a score for how good this hyperparam is.
            print "\t", k2, "\t", "%.2f" % (v[0] / maxperc), "\t", percent(v[1], v[2], rev=True)
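# Worked illustration with invented counts (not from any real run), assuming
# dictsort sorts largest-first, as the name maxperc suggests.  For one
# hyperparameter key "lr", suppose
#     cnts     = {"lr": {0.1: 8, 0.01: 2}}      # counts among the TOP jobs
#     cntsmore = {"lr": {0.1: 20, 0.01: 20}}    # counts among the MORE jobs
# Then the ratios are 8/20. = 0.40 and 2/20. = 0.10, maxperc = 0.40, and the
# printed scores are 1.00 for lr=0.1 and 0.25 for lr=0.01.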
def validate(translation_model, cnt):
    import math
#    logranks = []
#    logging.info("BEGINNING VALIDATION AT TRAINING STEP %d" % cnt)
#    logging.info(stats())
    i = 0
    tot = 0
    correct = 0
    for (i, ve) in enumerate(w2w.examples.get_all_validation_examples_cached()):
        correct_sequences, noise_sequences, weights = ebatch_to_sequences([ve])
        source_language = ve.l1
        is_correct = translation_model[source_language].validate_errors(correct_sequences, noise_sequences)
#        print r
        for w in weights:
            assert w == 1.0
        tot += 1
        if is_correct:
            correct += 1
        if i % 1000 == 0:
            logging.info("\tvalidating %d examples done..." % i)
#    logging.info("Validation of model %s at cnt %d: validation err %s" % (translation_model[source_language].modelname, cnt, percent(correct, tot)))
    logging.info("VALIDATION of model at cnt %d: validation accuracy %s" % (cnt, percent(correct, tot)))
print >> sys.stderr, "WEIRD WORD: %s" % word word = string.lower(word) assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"] tot += 1 if tot % 10000 == 0: print >> sys.stderr, "\tRead %d lines from %s" % ( tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]) if word in original_embeddings: # print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0]) continue else: original_embeddings[word] = numpy.array( [float(v) for v in vals[1:]]) print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS[ "W2W INITIAL EMBEDDINGS"] print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent( tot - len(original_embeddings), tot) print >> sys.stderr, stats() reversemap = targetmap(name="reverse") embeddings = numpy.zeros( (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])) assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]) ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"] for w in range(wordmap().len): embedding = None # If this word is in a different language than the embeddings. if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]: if w not in reversemap:
lucene.initVM()
# create an index called 'index-dir' in a temp directory
# indexDir = os.path.join(System.getProperty('java.io.tmpdir', 'tmp'),
#                         'index-dir')
# indexDir = "/Tmp/REMOVEME.index-dir"
indexDir = "lucene.ukwac"
dir = SimpleFSDirectory(File(indexDir))
analyzer = StandardAnalyzer(Version.LUCENE_30)
queryparser = QueryParser(Version.LUCENE_30, "text", analyzer)
searcher = IndexSearcher(dir)

nonzeros = 0
for i, l in enumerate(sys.stdin):
    if i % 100 == 0:
        print >> sys.stderr, "Read %d lines from sys.stdin (bloom filter has %s nonzeros)..." % (i, percent(nonzeros, BLOOM_FILTER_SIZE))
        print >> sys.stderr, stats()
    l = string.strip(l)

    added_this_sentence = 0
    for newl in retrieve(l, searcher, queryparser):
        # Iterate until we have added DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT documents
        if added_this_sentence >= DESIRED_NEW_DOCUMENTS_PER_ORIGINAL_DOCUMENT:
            break
        newl = string.strip(newl)

        # Hash the sentence
        idx = murmur.string_hash(newl.encode("utf-8")) % BLOOM_FILTER_SIZE
        # Don't use duplicate sentences
        if usedsentences[idx]:
            continue
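# Self-contained sketch (illustration only, not original code) of the
# duplicate check used above, with Python's hash() standing in for
# murmur.string_hash and a small list standing in for usedsentences.  A set
# slot means the sentence (or, rarely, a colliding one) was already emitted.
BLOOM_DEMO_SIZE = 1000
used_demo = [False] * BLOOM_DEMO_SIZE
for sentence in ["the cat sat", "a dog ran", "the cat sat"]:
    idx_demo = hash(sentence) % BLOOM_DEMO_SIZE
    if used_demo[idx_demo]:
        continue            # skip duplicates
    used_demo[idx_demo] = True
    # ... the sentence would be written out here ...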
    # (Excerpt: inside the loop that reads each line of the initial-embeddings
    # file; vals holds the fields of the current line, a word followed by its
    # embedding values.)
    word = vals[0]
    if HYPERPARAMETERS["W2W LOWERCASE INITIAL EMBEDDINGS BEFORE INITIALIZATION"] and word != "*UNKNOWN*":
        if (word[0] == '*' and word[-1] == '*' and len(word) > 1):
            print >> sys.stderr, "WEIRD WORD: %s" % word
        word = string.lower(word)
    assert len(vals[1:]) == HYPERPARAMETERS["EMBEDDING_SIZE"]

    tot += 1
    if tot % 10000 == 0:
        print >> sys.stderr, "\tRead %d lines from %s" % (tot, HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"])

    if word in original_embeddings:
#        print >> sys.stderr, "Skipping word %s (originally %s), we already have an embedding for it" % (word, vals[0])
        continue
    else:
        original_embeddings[word] = numpy.array([float(v) for v in vals[1:]])

print >> sys.stderr, "...done reading embeddings from %s" % HYPERPARAMETERS["W2W INITIAL EMBEDDINGS"]
print >> sys.stderr, "Skipped %s words for which we had duplicate embeddings" % percent(tot - len(original_embeddings), tot)
print >> sys.stderr, stats()

reversemap = targetmap(name="reverse")

embeddings = numpy.zeros((wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"]))
assert embeddings.shape == (wordmap().len, HYPERPARAMETERS["EMBEDDING_SIZE"])
ELANG = HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]
for w in range(wordmap().len):
    embedding = None
    # If this word is in a different language than the embeddings.
    if language(w) != HYPERPARAMETERS["W2W INITIAL EMBEDDINGS LANGUAGE"]:
        if w not in reversemap:
            print >> sys.stderr, "Word %s is not even in target map! Using *UNKNOWN*" % `wordmap().str(w)`
            embedding = original_embeddings["*UNKNOWN*"]
state.act = ACT
state.depth = DEPTH
state.depthbegin = depthbegin
state.n_hid = N_HID
state.noise = NOISE
state.activation_regularization_coeff = ACTIVATION_REGULARIZATION_COEFF
state.weight_regularization_coeff = WEIGHT_REGULARIZATION_COEFF
state.nepochs = NEPOCHS
state.LR = LR
state.noise_lvl = NOISE_LVL
state.epochstest = EPOCHSTEST
channel.save()

for depth in xrange(depthbegin, DEPTH):
    print >> sys.stderr, 'BEGIN DEPTH %s...' % (percent(depth+1, DEPTH))
    print >> sys.stderr, stats()
    if depth == 0:
        n_aux = NINPUTS
    else:
        n_aux = model.layers[depth-1].n_out
    if depth == 0 and INPUTTYPE == 'tfidf':
        model.depth_max = model.depth_max + 1
        model.reconstruction_cost = 'quadratic'
        model.reconstruction_cost_fn = quadratic_cost
        model.auxiliary(init=1, auxact='softplus', auxdepth=-DEPTH+depth+1, auxn_out=n_aux)
    else:
        model.depth_max = model.depth_max + 1
        if depth == 0 or ACT[depth-1] != 'rectifier':
            model.reconstruction_cost = 'cross_entropy'
            model.reconstruction_cost_fn = cross_entropy_cost
print >> sys.stderr, "Writing to %s" % `outfilenames` for f in filenames: assert os.path.exists(f) for f in outfilenames: if os.path.exists(f): print >> sys.stderr, "Warning, going to overwrite %s" % f #print "Sleeping for 10 seconds..." #import time #time.sleep(10) inf = [open(f) for f in filenames] outf = [open(f, "wt") for f in outfilenames] tot = 0 cnt = 0 for lines in izip(*inf): tot += 1 keep = False for w in string.split(lines[0]): if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]: keep = True break if keep: cnt += 1 for l, f in izip(lines, outf): f.write(l) if tot % 10000 == 0: print >> sys.stderr, "%s lines kept" % percent(cnt, tot) print >> sys.stderr, stats()
channel.save()

err = dict([(trainsize, {}) for trainsize in VALIDATION_TRAININGSIZE])
rebuildunsup(model, LR, NOISE_LVL, ACTIVATION_REGULARIZATION_COEFF, WEIGHT_REGULARIZATION_COEFF, BATCHSIZE, train)

epoch = 0
if epoch in EPOCHSTEST:
    svm_validation(err, epoch, model, train, datatrain, datatrainsave, datatest, datatestsave, VALIDATION_TRAININGSIZE, VALIDATION_RUNS_FOR_EACH_TRAININGSIZE, PATH_SAVE, PATH_DATA, NAME_DATATEST)
    channel.save()

train_reconstruction_error_mvgavg = MovingAverage()
for epoch in xrange(1, NEPOCHS+1):
    time1 = time.time()
    state.currentepoch = epoch
    for filenb in xrange(1, NB_FILES + 1):
        print >> sys.stderr, "\t\tAbout to read file %s..." % percent(filenb, NB_FILES)
        print >> sys.stderr, "\t\t", stats()
#        initial_file_time = time.time()
        f = open(PATH_DATA + NAME_DATA + '_%s.pkl' % filenb, 'r')
        object = numpy.asarray(cPickle.load(f), dtype=theano.config.floatX)
        print >> sys.stderr, "\t\t...read file %s" % percent(filenb, NB_FILES)
        print >> sys.stderr, "\t\t", stats()
        # The last training file is not of the same shape as the other training files.
        # So, to avoid a GPU memory error, we want to make sure it is the same size.
        # In that case, we pad the matrix but keep track of how many instances (n) there actually are.
        # TODO: Also want to pad trainl
        if object.shape == normalshape:
            train.container.value[:] = object
            currentn = normalshape[0]
            del object
        else:
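# Hedged sketch with invented shapes (not original code) of the padding
# described in the comment above: a short final file is copied into a zero
# matrix of the normal shape, and the real instance count is tracked
# separately (the role played by currentn).
import numpy

normalshape_demo = (8, 3)
short_demo = numpy.ones((5, 3))
padded_demo = numpy.zeros(normalshape_demo)
padded_demo[:short_demo.shape[0]] = short_demo
currentn_demo = short_demo.shape[0]    # only the first 5 rows are real instances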
from targetvocabulary import targetmap

for w1 in wordmap().all:
    w1 = wordmap().id(w1)
    # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
    assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
    if language(w1) is None:
        print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
        continue
    if w1 not in targetmap():
        print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
        continue
    for l2 in targetmap()[w1]:
        totcnt = 0
        for cnt, w2 in dictsort(targetmap()[w1][l2]):
            totcnt += cnt
        print wordmap().str(w1), l2, [(percent(cnt, totcnt), wordform(w2)) for cnt, w2 in dictsort(targetmap()[w1][l2])]

print >> sys.stderr, "REVERSE MAP NOW"

for w1 in wordmap().all:
    w1 = wordmap().id(w1)
    # Actually, should assert W2W SKIP TRANSLATIONS FROM UNKNOWN WORD
    assert HYPERPARAMETERS["W2W SKIP TRANSLATIONS TO UNKNOWN WORD"]
    if language(w1) is None:
        print >> sys.stderr, "Skipping %s" % `wordmap().str(w1)`
        continue
    if w1 not in targetmap(name="reverse"):
        print >> sys.stderr, "Skipping %s, not a source word in targetmap" % `wordmap().str(w1)`
        continue
    for l2 in targetmap(name="reverse")[w1]:
        totcnt = 0
"--seed", dest="seed", default=0, type="int", help="random seed") (options, args) = parser.parse_args() import random random.seed(options.seed) numpy.random.seed(options.seed) assert len(args) > 0 # You need to pass in pkl files to project. for i, f in enumerate(args): print >> sys.stderr, "\nLoading %s (file %s)..." % ( f, percent(i + 1, len(args))) print >> sys.stderr, stats() x = cPickle.load(open(f, "rb")) print >> sys.stderr, "...loading %s (file %s)" % ( f, percent(i + 1, len(args))) print >> sys.stderr, stats() assert x.ndim == 2 print >> sys.stderr, "Read instance matrix with shape %s, creating projection with shape %s" % ( x.shape, (x.shape[0], options.dimensions)) newx = project(x, dimensions=options.dimensions, seed=options.seed, randomization_type=RANDOMIZATION_TYPE, mode=MODE)
if __name__ == "__main__": from optparse import OptionParser parser = OptionParser() parser.add_option("-d", "--dimensions", dest="dimensions", default=1000, type="int", help="number of dimensions in random output") parser.add_option("-s", "--seed", dest="seed", default=0, type="int", help="random seed") (options, args) = parser.parse_args() import random random.seed(options.seed) numpy.random.seed(options.seed) assert len(args) > 0 # You need to pass in pkl files to project. for i, f in enumerate(args): print >> sys.stderr, "\nLoading %s (file %s)..." % (f, percent(i+1, len(args))) print >> sys.stderr, stats() x = cPickle.load(open(f, "rb")) print >> sys.stderr, "...loading %s (file %s)" % (f, percent(i+1, len(args))) print >> sys.stderr, stats() assert x.ndim == 2 print >> sys.stderr, "Read instance matrix with shape %s, creating projection with shape %s" % (x.shape, (x.shape[0], options.dimensions)) newx = project(x, dimensions=options.dimensions, seed=options.seed, randomization_type=RANDOMIZATION_TYPE, mode=MODE) assert newx.shape == (x.shape[0], options.dimensions) if SCALE_BEFORE_SQUASH == None: SCALE_BEFORE_SQUASH = 1. / newx.std() print >> sys.stderr, "Setting SCALE_BEFORE_SQUASH to %f on the basis of %s" % (SCALE_BEFORE_SQUASH, f)