def decode():
    with tf.Session() as sess:
        # Load vocabularies.
        vocab_file = FLAGS.data_dir + "/vocab.pkl"
        word2id = pkl.load(open(vocab_file, "rb"))
        id2word = {v: k for (k, v) in word2id.items()}
        embeddings = embedding.Embedding(None, word2id, id2word,
                                         word2id["UNK"], word2id["PAD"],
                                         word2id["</s>"], word2id["<s>"])

        # Create model and load parameters.
        FLAGS.batch_size = 1  # We decode one sentence at a time.
        model = create_model(sess, True, len(word2id))

        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            encoder_inputs, decoder_inputs, target_weights, bucket_id = \
                utils.prepare_input_sent(sentence, embeddings, _buckets)
            # Get output logits for the sentence.
            _, _, output_logits = model.step(
                sess,
                np.array([encoder_inputs]).transpose(),
                np.array([decoder_inputs]).transpose(),
                np.array([target_weights]).transpose(),
                bucket_id, True)
            print(utils.process_output(output_logits, embeddings))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
def propagate(model=None, positive_seed=None, negative_seed=None, name="Unknown"):
    print("[INFO] Model name:", name)
    return pi.random_walk(
        embedding.Embedding(model.wv.vectors, list(model.wv.vocab.keys())),
        positive_seed, negative_seed)
def load_data(corpus_file, word2id, max_sent=0):
    """
    Given a dataset file and word2id embeddings, read them into lists.
    :param corpus_file: path to the corpus file, one sentence per line
    :param word2id: mapping from word to vocabulary id
    :param max_sent: maximum number of sentences to read (0 = no limit)
    :return: feature_vectors, sentences, labels
    """
    end_id = word2id["</s>"]
    PAD_id = word2id["PAD"]
    UNK_id = word2id["UNK"]
    start_id = word2id["<s>"]
    id2word = {v: k for (k, v) in word2id.items()}
    word_embedding = embedding.Embedding(None, word2id, id2word, UNK_id,
                                         PAD_id, end_id, start_id)
    # load features and labels
    feature_vectors = []
    sentences = []
    labels = []
    with codecs.open(corpus_file, "r", "utf8", "replace") as data:
        i = 0
        for line in data:
            if i >= max_sent and max_sent > 0:
                break
            stripped = line.strip()
            tokens = stripped.split()
            vector = word_embedding.encode(tokens)
            vector.append(end_id)  # add </s> to sentence
            sentences.append(tokens)
            if i == 0:
                # no previous dialogue available
                feature_vectors.append(vector)
            else:
                # input is previous sentence
                feature_vectors.append(labels[i - 1])
            vector = [start_id] + vector  # prepend start_id to decoder inputs
            labels.append(vector)
            i += 1
    logging.info("Loaded %d sentences" % len(feature_vectors))
    return feature_vectors, sentences, labels
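# A minimal usage sketch for load_data() above. The toy vocabulary and the
# file name "toy_corpus.txt" are made-up stand-ins, not the project's real
# vocab.pkl / corpus.txt, and the sketch assumes embedding.Embedding accepts
# None as the vector table, as it does in decode() and train().
toy_word2id = {"PAD": 0, "UNK": 1, "<s>": 2, "</s>": 3, "salam": 4, "khoobi": 5}
with codecs.open("toy_corpus.txt", "w", "utf8") as f:
    f.write("salam khoobi\nkhoobi salam\n")
# feature_vectors[0] is the first sentence itself (ending in </s>);
# feature_vectors[1] is the previous label, i.e. the first sentence wrapped
# in <s> ... </s>.
features, sents, labels = load_data("toy_corpus.txt", toy_word2id, max_sent=2)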
def __init__(self, data, dim, tau, grid_params):
    self.pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    _embedding = embedding.Embedding(data)
    self.embedded = _embedding.embedding(tau=tau, m=dim)
    # transform to features and labels
    self.features = []
    self.labels = []
    for i, vector in enumerate(self.embedded):
        if (i + 1) >= len(self.embedded):
            break
        self.features.append(vector)
        self.labels.append(self.embedded[i + 1])
    self.features = np.array(self.features)
    self.labels = np.array(self.labels)
    assert isinstance(grid_params, dict), 'grid_params must be dict'
    self.grid_params = grid_params
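# For intuition: the loop above just pairs each delay vector with its
# successor. A standalone sketch of the same one-step-ahead pairing, using a
# made-up array in place of the output of embedding.Embedding(...).embedding():
import numpy as np

embedded = np.array([[0.1, 0.2, 0.3],
                     [0.2, 0.3, 0.4],
                     [0.3, 0.4, 0.5],
                     [0.4, 0.5, 0.6]])  # toy delay vectors (m=3)
# predict embedded[i + 1] from embedded[i]
features, labels = embedded[:-1], embedded[1:]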
def load_embedding(pkl_file):
    word2id = {}
    id2word = {}
    with codecs.open(pkl_file, "rb", "utf8", "replace") as opened:
        words, vectors = pkl.load(opened)
    assert len(words) == len(vectors)
    UNK_id = words.index("<UNK>")
    PAD_id = words.index("<PAD>")
    start_id = words.index("<S>")
    end_id = words.index("</S>")
    word2id["<s>"] = start_id
    word2id["</s>"] = end_id
    for i, w in enumerate(words):
        word2id[w] = i
        id2word[i] = w
    logging.info("Loaded embeddings for %d words with dimensionality %d" %
                 (len(words), len(vectors[0])))
    #print "Special tokens:", UNK_id, PAD_id, start_id, end_id
    emb = embedding.Embedding(vectors, word2id, id2word, UNK_id, PAD_id,
                              end_id, start_id)
    return emb
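# For reference, load_embedding() above expects the pickle to hold a pair
# (words, vectors) with the four special tokens present; a toy illustration of
# that layout (the words and 4-dimensional vectors are made up):
words = ["<UNK>", "<PAD>", "<S>", "</S>", "hello", "world"]
vectors = [[0.0] * 4 for _ in words]  # one vector per word, same ordering
# word2id / id2word are then built from the parallel lists, and the special
# token ids are taken from their positions in `words`.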
import sys

print(sys.argv[0])
sys.path.append('../..')

import torch
import torch.backends.cudnn as cudnn

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.cuda.empty_cache()
    cudnn.benchmark = True

import segsemdata
import embedding
import numpy as np

print("load model")
net = embedding.Embedding(pretrained="/data/vgg16-00b39a1b.pth")
net = net.to(device)

print("load data")
datatrain = segsemdata.makeDFC2015(datasetpath="/data/DFC2015",
                                   lod0=False,
                                   dataflag="train")
datatrain = datatrain.copyTOcache(outputresolution=50)
net.adddataset(datatrain.metadata())
net = net.to(device)
nbclasses = len(datatrain.setofcolors)
earlystopping = datatrain.getrandomtiles(1000, 128, 16)

print("train setting")
import torch.nn as nn
#!/usr/bin/env python

import torch
import numpy as np
import pandas as pd
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

import embedding

sns.set(style="whitegrid", color_codes=True)

ref = embedding.Embedding(gpu=False)
ref.load_vectors("output/pi.1000.txt")
ref.embedding /= ref.embedding.norm(2, 0).expand_as(ref.embedding)
dim = ref.embedding.shape[1]

method = {
    "Power Iteration": "pi",
    # "Power Iteration with Momentum": "pim"
}

l1 = {}  # First component loss
l2 = {}  # Second component loss
lw = {}  # Worst component loss

for m in method:
    it = [i + 1 for i in range(1000)]
# print(x)
# print()
if epoch % count_loss == 0:
    q, a, q_len, a_len = self.get_validation(input, output)
    accuracy, result, targets = sess.run(
        [
            self.accuracy, self.pred_labels_sliced,
            self.decoder_train_targets
        ],
        feed_dict=self._dict(q, a, q_len, a_len))
    print("loss :\t", loss)
    print("accuracy :\t", accuracy)
    print(result)
    print(a)
    print(targets)
    print()


if __name__ == "__main__":
    print(random.randint(0, 9))
    embedding = emb.Embedding(trainable=True)
    embedding.load("./tmp_vectors.txt")
    embedding.init()
    # make a seq2seq model, with embeddings loaded from tmp_vector_file
    model = Seq2Seq(embedding, 100)
    # 3 questions and answers: 1- Q: salam khoobi => A: mersi khoobam,
    # 2- Q: che khabar => A: salamati ....
    model.train(
        [["salam", "khoobi"], ["che", "khabar"], ["aya", "hava", "sarde"]],
        [["mersi", "khoobam"], ["salamati"], ["are", "fekr", "konam"]])
if len(sys.argv) > 1:
    env = sys.argv[1]
else:
    env = "local"

# print(labels)
print("Total labels: ", len(config.labels))
print(config.vocabulary_size)

path = ""
if env == "local":
    path = "data/reuters/"
elif env == "server":
    path = "data/reuters/"

cnn = cn.Embedding()

# Construct model
pred = cnn.network(cnn.x, cnn.weights, cnn.biases, cnn.dropout)

# Define loss and optimizer
#cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=pred, labels=cnn.y))
#cost = tf.reduce_mean(bpmll_out_module.bp_mll(pred, cnn.y))
cost = -tf.reduce_sum(
    ((cnn.y * tf.log(pred + 1e-9)) + ((1 - cnn.y) * tf.log(1 - pred + 1e-9))),
    name='xentropy') + 0.01 * (tf.nn.l2_loss(cnn.weights['wd1']) +
                               tf.nn.l2_loss(cnn.weights['out']))
optimizer = tf.train.AdamOptimizer(
    learning_rate=cnn.learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(cnn.y, 1))
    '/Users/pengxiang/corpora/spaces/enwiki-20160901/dim300vecs.bin.gz', True)

levy_deps = embedding.Embedding('levy_deps', 300, syntax_label=False)
levy_deps.load_model('/Users/pengxiang/corpora/spaces/levy_deps', False)

pair_triple = embedding.Embedding(
    'event_based_pair_triple', 300, syntax_label=True)
pair_triple.load_model(
    '/Users/pengxiang/corpora/spaces/enwiki-20160901/event_based/'
    'dim300vecs_w_surface_pair_c_lemma_triple', False)

'''
event_model = embedding.Embedding('event_script', 300, syntax_label=True,
                                  use_ner=True, use_lemma=True,
                                  include_compounds=True)
event_model.load_model(
    '/Users/pengxiang/corpora/spaces/03141230_dim300vecs.bin', True)

most_sim_event_eval = MostSimEventEvaluator()
most_sim_event_eval.set_use_max_score(True)
most_sim_event_eval.set_rep_only(True)
most_sim_event_eval.set_head_only(True)
'''

most_sim_event_eval.set_model(word2vec)
most_sim_event_eval.evaluate(all_scripts)

most_sim_event_eval.set_model(levy_deps)
import copy
from pathlib import Path, PurePath

import numpy as np
import pandas as pd

import embedding
import classifiers

pp = PurePath(Path.cwd()).parts
pdir = PurePath(*pp)
bid, ask = pd.read_csv(str(pdir) + '/data/eurusd-bid-1h.csv'), pd.read_csv(
    str(pdir) + '/data/eurusd-ask-1h.csv')
mids = ((bid.iloc[:, 1] + ask.iloc[:, 1]) / 2).dropna()

_embedding = embedding.Embedding(mids)
time_delayed_mi = _embedding.time_delayed_mutual_information()
# _embedding.plot_mutual_information(time_delayed_mi)

# First minimum of time-delayed mutual information
time_delay = _embedding.locmin(time_delayed_mi)[0]
# _embedding.plot_delayed_series(tau=time_delay)

# Calculate FNN in the range of 10 dimensions. Takes some time to calculate!
# dim = np.arange(1, 10 + 1)
# f1, f2, f3 = _embedding.fnn(mids.values, dim=dim, tau=time_delay, window=10, metric='cityblock')
# _embedding.plot_fnn(dim, f1, f2, f3)

# judging from the plot above, FNN goes beyond 10% at dim=4
m = 4
import sys

print(sys.argv)
sys.path.append('../..')

import torch
import torch.backends.cudnn as cudnn

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    torch.cuda.empty_cache()
    cudnn.benchmark = True

import segsemdata
import embedding
import numpy as np

print("load model")
net = embedding.Embedding(pretrained="/home/achanhon/vgg16-00b39a1b.pth")
net = net.to(device)

print("load data")
datatrain = segsemdata.makeTinyMiniFrancePerTown(
    datasetpath="/data01/PUBLIC_DATASETS/MiniFrance/tmFrance/",
    town="all",
    dataflag="train")
if len(sys.argv) > 1 and sys.argv[1] == "grey":
    datatrain = datatrain.copyTOcache(color=False)
if len(sys.argv) > 1 and sys.argv[1] == "normalize":
    datatrain = datatrain.copyTOcache(color=False, normalize=True)
net.adddataset(datatrain.metadata())
net = net.to(device)
def train():
    with tf.Session() as sess:
        # Read data into buckets and compute their sizes.
        print("Reading training data (limit: %d)." % FLAGS.max_train_data_size)

        # load data and embeddings
        train_file = FLAGS.data_dir + "/corpus.txt"
        vocab_file = FLAGS.data_dir + "/vocab.pkl"
        word2id = pkl.load(open(vocab_file, "rb"))
        id2word = {v: k for (k, v) in word2id.items()}
        embeddings = embedding.Embedding(None, word2id, id2word,
                                         word2id["UNK"], word2id["PAD"],
                                         word2id["</s>"], word2id["<s>"])
        vocab_size = len(word2id)
        train_feature_vectors, train_sentences, train_labels = \
            utils.load_data(train_file, word2id,
                            max_sent=FLAGS.max_train_data_size)

        print("vocab size: %d" % vocab_size)
        print("Training on %d instances" % len(train_labels))
        print("Maximum sentence length (train): %d" %
              max([len(y) for y in train_labels]))
        print("Average sentence length (train): %d" %
              np.mean([len(y) for y in train_labels]))

        # bucketing training data
        # equal bucket sizes
        #buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]  # pre-defined buckets
        data_buckets, reordering_indexes = utils.put_in_double_buckets(
            np.asarray(train_feature_vectors), np.asarray(train_labels),
            _buckets, embeddings.PAD_id)
        bucket_sizes = [0] * len(_buckets)
        for i, indx in reordering_indexes.items():
            bucket_sizes[i] = len(indx)
        print("Bucket sizes: %s" % str(bucket_sizes))

        # A bucket scale is a list of increasing numbers from 0 to 1 that we'll
        # use to select a bucket. Length of [scale[i], scale[i+1]] is
        # proportional to the size of the i-th training bucket, as used later.
        buckets_scale = [
            sum(bucket_sizes[:i + 1]) / len(train_labels)
            for i in xrange(len(bucket_sizes))
        ]
        print("Bucket scale: %s" % str(buckets_scale))

        # Create model.
        print("Creating %d layers of %d units." % (FLAGS.num_layers, FLAGS.size))
        model = create_model(sess, False, vocab_size)

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        while True:
            # Choose a bucket according to the number of samples within
            probs = np.array(buckets_scale) / sum(buckets_scale)
            bucket_id = np.random.choice(range(len(buckets_scale)), p=probs)
            #print("Bucket %d" % bucket_id)

            # Get a batch and make a step.
            start_time = time.time()
            bucket_xs, bucket_ys, input_lens, output_lens, bucket_masks = \
                data_buckets[bucket_id]
            # random order of samples in batch
            order = np.random.permutation(len(bucket_xs))
            batch_samples = order[:FLAGS.batch_size]
            #print("Batch samples: %s" % str(batch_samples))

            # get a batch from this bucket
            encoder_inputs = bucket_xs[batch_samples]  # TODO reverse inputs?
            decoder_inputs = bucket_ys[batch_samples]
            target_weights = bucket_masks[batch_samples]
            #print(encoder_inputs.shape, decoder_inputs.shape, target_weights.shape)

            # batch x seq_len -> transpose as input
            _, step_loss, _ = model.step(sess, encoder_inputs.transpose(),
                                         decoder_inputs.transpose(),
                                         target_weights.transpose(),
                                         bucket_id, False)
            step_time += (time.time() - start_time) / FLAGS.steps_per_checkpoint
            loss += step_loss / FLAGS.steps_per_checkpoint
            current_step += 1

            # Once in a while, we save checkpoint, print statistics, and run evals.
            if current_step % FLAGS.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                print("global step %d learning rate %.4f step-time %.2f "
                      "perplexity %.2f" %
                      (model.global_step.eval(), model.learning_rate.eval(),
                       step_time, perplexity))
                previous_losses.append(loss)
                # Save checkpoint and zero timer and loss.
                checkpoint_path = os.path.join(FLAGS.model_dir, "translate.ckpt")
                model.saver.save(sess, checkpoint_path,
                                 global_step=model.global_step)
                step_time, loss = 0.0, 0.0
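# A small standalone illustration of the bucket selection in train():
# buckets_scale holds the cumulative fraction of training examples up to each
# bucket, and that vector is normalized into sampling probabilities for
# np.random.choice. The bucket sizes below are made-up numbers.
import numpy as np

bucket_sizes = [200, 300, 500]  # toy bucket sizes
total = sum(bucket_sizes)
buckets_scale = [sum(bucket_sizes[:i + 1]) / total
                 for i in range(len(bucket_sizes))]   # -> [0.2, 0.5, 1.0]
probs = np.array(buckets_scale) / sum(buckets_scale)  # normalize to sum to 1
bucket_id = np.random.choice(range(len(buckets_scale)), p=probs)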
EMOJIS = api_call.get('emoji').keys()
print EMOJIS

# load the tf model
with tf.Session() as sess:
    # Load slack metadata
    metadata = None
    with open("metadata.json", "r") as m:
        metadata = json.load(m)

    # Load vocabularies.
    vocab_file = FLAGS.data_dir + "/vocab.pkl"
    word2id = pkl.load(open(vocab_file, "rb"))
    id2word = {v: k for (k, v) in word2id.items()}
    embeddings = embedding.Embedding(None, word2id, id2word, word2id["UNK"],
                                     word2id["PAD"], word2id["</s>"],
                                     word2id["<s>"])

    # Create model and load parameters.
    model = create_model(sess, True, len(word2id))

    if slack_client.rtm_connect():
        print "%s running: id %s, token %s" % (BOT_NAME, BOT_ID, TOKEN)
        while True:
            command, channel = parse_slack_output(slack_client.rtm_read())
            if command and channel:
                handle_command(command, channel, model, embeddings, metadata)
            time.sleep(READ_WEBSOCKET_DELAY)
    else:
        print "%s failed" % BOT_NAME