def _get_data(self):
    try:
        d1, self.fmt = utils.loadfile(filename=self.obj1)
        d2, self.fmt = utils.loadfile(filename=self.obj2)
    except Exception:
        # Fall back to a plain unified diff when the files cannot be parsed.
        (retcode, out, err) = utils.runCommand("diff -u %s %s" % (self.obj1, self.obj2))
        # Normalize both file headers in the diff output to the second path,
        # so the diff does not vary with the first file's location.
        if out:
            out = re.sub(r"%s.*" % (self.obj1), self.obj2, out)
            out = re.sub(r"%s.*" % (self.obj2), self.obj2, out)
        if err:
            err = re.sub(r"%s.*" % (self.obj1), self.obj2, err)
            err = re.sub(r"%s.*" % (self.obj2), self.obj2, err)
        return 1, out, err
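# --- Hypothetical helper sketch (not part of the original snippet) ---
# `utils.runCommand` is not shown above. A minimal sketch, assuming it returns
# a (returncode, stdout, stderr) tuple, which is how the fallback path uses it:
import subprocess

def runCommand(cmd):
    proc = subprocess.Popen(cmd, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
    out, err = proc.communicate()
    return proc.returncode, out, err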
def main():
    # Argument parsing
    parser = argparse.ArgumentParser(description='Web crawler')
    parser.add_argument('website', nargs='?', action='store',
                        help='website to crawl')
    parser.add_argument('-l', action='store', default=2,
                        help='maximum depth level to crawl')
    parser.add_argument('-resume', action='store_const', const=32,
                        help='resume crawler')
    global crawler_state
    args = parser.parse_args()

    # Crawler
    c = Crawler(args.website, int(args.l))
    crawler_state = c

    # Register Ctrl+C handler so the crawler state can be saved on interrupt
    signal.signal(signal.SIGINT, signal_handler)

    if not args.resume:
        info = c.crawl(args.website)
    else:
        # Recover state from the last interrupted run
        print("recovering...")
        raw = loadfile('state.data')
        data = pickle.loads(raw)
        c.output = data.output
        c.queue = data.queue
        c.queue.append((data.current_url, data.level))
        c.visited = data.visited
        c.url = data.url
        c.maxdepth = data.maxdepth
        info = c.crawl(data.url)
        print("OK")

    # Save data
    savefile('structure.txt', info)
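# --- Hypothetical companion sketch (not part of the original snippet) ---
# The `signal_handler` registered above is not shown here. A minimal sketch,
# assuming the crawler object itself is pickled to 'state.data' (which is what
# the resume branch above unpickles) and that `savefile` is the write-side
# counterpart of `loadfile`:
import pickle
import sys

def signal_handler(sig, frame):
    # Persist the global crawler so a later run with -resume can continue.
    savefile('state.data', pickle.dumps(crawler_state))
    sys.exit(0)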
def load_data(dataset_str):
    names = [['ent_ids_1', 'ent_ids_2'],
             ['training_attrs_1', 'training_attrs_2'],
             ['triples_1', 'triples_2'],
             ['ref_ent_ids']]
    for fns in names:
        for i in range(len(fns)):
            fns[i] = 'data/' + dataset_str + '/' + fns[i]
    Es, As, Ts, ill = names
    ill = ill[0]

    # Reference entity alignments: shuffle, then split into train/test by
    # tenths (FLAGS.seed tenths for training, the rest for testing).
    ILL = loadfile(ill, 2)
    illL = len(ILL)
    np.random.shuffle(ILL)
    train = np.array(ILL[:illL // 10 * FLAGS.seed])
    test = ILL[illL // 10 * FLAGS.seed:]

    # Entity-to-id maps for the two knowledge graphs
    kg1 = get_ent2id([Es[0]])
    kg1_list = list(kg1.values())
    kg2 = get_ent2id([Es[1]])
    kg2_list = list(kg2.values())

    save_train_test_into_txt(np.asarray(test), 'test.txt')
    _, new_train = generate_fake(train, test, kg1_list, kg2_list)
    save_train_test_into_txt(new_train, 'new_train.txt')
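# --- Hypothetical helper sketch (not part of the original snippet) ---
# `save_train_test_into_txt` is not shown above. A minimal sketch, assuming it
# writes one tab-separated pair of integer entity ids per line:
def save_train_test_into_txt(pairs, path):
    with open(path, 'w') as f:
        for e1, e2 in pairs:
            f.write('%d\t%d\n' % (e1, e2))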
def main(args):
    # Load configuration
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")
    choice_qs = tf.placeholder(tf.float32, [None, None], name="choice")
    emo_cat = tf.placeholder(tf.int32, [None], name="emotion_category")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     dec_num_layers, dec_num_units, dec_cell_type, state_pass,
     num_emo, emo_cat_units, emo_int_units, infer_batch_size,
     beam_size, max_iter, attn_num_units, l2_regularize) = get_ECM_config(config)

    print("Building model architecture ...")
    CE, loss, train_outs, infer_outputs = compute_ECM_loss(
        source_ids, target_ids, sequence_mask, choice_qs, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass,
        num_emo, emo_cat, emo_cat_units, emo_int_units,
        infer_batch_size, beam_size, max_iter,
        attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    (logdir, restore_from, learning_rate, gpu_fraction, max_checkpoints,
     train_steps, batch_size, print_every, checkpoint_every, s_filename,
     t_filename, q_filename, c_filename, s_max_leng, t_max_leng,
     dev_s_filename, dev_t_filename, dev_q_filename, dev_c_filename,
     loss_fig, perp_fig) = get_ECM_training_config(config)
    is_overwritten_training = logdir != restore_from

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       epsilon=1e-4)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=max_checkpoints)

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except Exception:
        print("Something went wrong while restoring checkpoint. "
              "Training is terminated to avoid the overwriting.")
        raise

    # ##### Training #####
    # Load data
    print("Loading data ...")
    # id_0, id_1, id_2 preserved for SOS, EOS, constant zero padding
    embed_shift = 3

    source_data = loadfile(s_filename, is_source=True,
                           max_length=s_max_leng) + embed_shift
    target_data = loadfile(t_filename, is_source=False,
                           max_length=t_max_leng) + embed_shift
    choice_data = loadfile(q_filename, is_source=False,
                           max_length=t_max_leng)
    choice_data[choice_data < 0] = 0
    choice_data = choice_data.astype(np.float32)
    category_data = pd.read_csv(c_filename, header=None,
                                index_col=None, dtype=int)[0].values
    masks = (target_data != -1)
    n_data = len(source_data)

    dev_source_data = None
    if dev_s_filename is not None:
        dev_source_data = loadfile(dev_s_filename, is_source=True,
                                   max_length=s_max_leng) + embed_shift
        dev_target_data = loadfile(dev_t_filename, is_source=False,
                                   max_length=t_max_leng) + embed_shift
        dev_choice_data = loadfile(dev_q_filename, is_source=False,
                                   max_length=t_max_leng)
        dev_choice_data[dev_choice_data < 0] = 0
        dev_choice_data = dev_choice_data.astype(np.float32)
        dev_category_data = pd.read_csv(dev_c_filename, header=None,
                                        index_col=None, dtype=int)[0].values
        dev_masks = (dev_target_data != -1)
    print("\tDone.")

    # Training
    last_saved_step = saved_global_step
    num_steps = saved_global_step + train_steps
    losses = []
    steps = []
    perps = []
    dev_perps = []

    print("Start training ...")
    try:
        for step in range(saved_global_step + 1, num_steps):
            start_time = time.time()

            # Sample a random training batch
            rand_indexes = np.random.choice(n_data, batch_size)
            source_batch = source_data[rand_indexes]
            target_batch = target_data[rand_indexes]
            choice_batch = choice_data[rand_indexes]
            emotions = category_data[rand_indexes]
            mask_batch = masks[rand_indexes]

            feed_dict = {
                source_ids: source_batch,
                target_ids: target_batch,
                choice_qs: choice_batch,
                emo_cat: emotions,
                sequence_mask: mask_batch,
            }
            loss_value, _ = sess.run([loss, optim], feed_dict=feed_dict)
            losses.append(loss_value)

            duration = time.time() - start_time

            if step % print_every == 0:
                # train perplexity
                t_perp = compute_perplexity(sess, CE, mask_batch, feed_dict)
                perps.append(t_perp)

                # dev perplexity
                dev_str = ""
                if dev_source_data is not None:
                    dev_inds = np.random.choice(len(dev_source_data),
                                                batch_size)
                    dev_feed_dict = {
                        source_ids: dev_source_data[dev_inds],
                        target_ids: dev_target_data[dev_inds],
                        choice_qs: dev_choice_data[dev_inds],
                        emo_cat: dev_category_data[dev_inds],
                        sequence_mask: dev_masks[dev_inds],
                    }
                    dev_perp = compute_perplexity(sess, CE,
                                                  dev_masks[dev_inds],
                                                  dev_feed_dict)
                    dev_perps.append(dev_perp)
                    dev_str = "dev_perp: {:.3f}, ".format(dev_perp)

                steps.append(step)
                info = 'step {:d}, loss = {:.6f}, '
                info += 'perp: {:.3f}, {}({:.3f} sec/step)'
                print(info.format(step, loss_value, t_perp, dev_str, duration))

            if step % checkpoint_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C so the save message is on its own line.
        print()

    finally:
        if step > last_saved_step:
            save(saver, sess, logdir, step)

    # plot loss
    plt.figure()
    plt.plot(losses)
    plt.title("Total loss")
    plt.xlabel("step")
    plt.savefig(loss_fig)

    # plot perplexity
    plt.figure()
    if len(perps) > len(steps):
        perps.pop()
    plt.plot(steps[5:], perps[5:], label="train")
    if dev_source_data is not None:
        plt.plot(steps[5:], dev_perps[5:], label="dev")
    plt.title("Perplexity")
    plt.xlabel("step")
    plt.legend()
    plt.savefig(perp_fig)
def main(args):
    # Load configuration
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]
    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")
    choice_qs = tf.placeholder(tf.float32, [None, None], name="choice")
    emo_cat = tf.placeholder(tf.int32, [None], name="emotion_category")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
     dec_num_layers, dec_num_units, dec_cell_type, state_pass,
     num_emo, emo_cat_units, emo_int_units, infer_batch_size,
     beam_size, max_iter, attn_num_units, l2_regularize) = get_ECM_config(config)

    print("Building model architecture ...")
    CE, loss, train_outs, infer_outputs = compute_ECM_loss(
        source_ids, target_ids, sequence_mask, choice_qs, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass,
        num_emo, emo_cat, emo_cat_units, emo_int_units,
        infer_batch_size, beam_size, max_iter,
        attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Set up session
    restore_from = config["training"]["restore_from"]
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for restoring checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables())

    try:
        saved_global_step = load(saver, sess, restore_from)
        if saved_global_step is None:
            raise ValueError("Cannot find the checkpoint to restore from.")
    except Exception:
        print("Something went wrong while restoring checkpoint.")
        raise

    # ##### Inference #####
    # Load data
    print("Loading inference data ...")
    # id_0, id_1, id_2 preserved for SOS, EOS, constant zero padding
    embed_shift = 3

    filename = config["inference"]["infer_source_file"]
    c_filename = config["inference"]["infer_category_file"]
    max_leng = config["inference"]["infer_source_max_length"]

    source_data = loadfile(filename, is_source=True,
                           max_length=max_leng) + embed_shift
    category_data = pd.read_csv(c_filename, header=None,
                                index_col=None, dtype=int)[0].values
    print("\tDone.")

    # Inference
    print("Start inferring ...")
    final_result = []
    n_data = source_data.shape[0]

    # Pad the data so it divides evenly into inference batches.
    n_pad = n_data % infer_batch_size
    if n_pad > 0:
        n_pad = infer_batch_size - n_pad
    pad = np.zeros((n_pad, max_leng), dtype=np.int32)
    source_data = np.concatenate((source_data, pad))
    category_data = np.concatenate((category_data, np.zeros(n_pad)))

    for ith in range(int(len(source_data) / infer_batch_size)):
        start = ith * infer_batch_size
        end = (ith + 1) * infer_batch_size
        batch = source_data[start:end]
        batch_cat = category_data[start:end]

        result = sess.run(infer_outputs,
                          feed_dict={source_ids: batch,
                                     emo_cat: batch_cat})
        # Keep the top beam for every position
        result = result.ids[:, :, 0]

        # Pad short decodes out to max_iter so batches can be concatenated
        if result.shape[1] < max_iter:
            l_pad = max_iter - result.shape[1]
            result = np.concatenate(
                (result, np.ones((infer_batch_size, l_pad))), axis=1)
        final_result.append(result)

    # Drop the padded rows and undo the embedding id shift.
    final_result = np.concatenate(final_result)[:n_data] - embed_shift
    choice_pred = (final_result >= vocab_size).astype(np.int)
    final_result[final_result >= vocab_size] -= (vocab_size + embed_shift)

    # transform to output format
    final_result[final_result < 0] = -1
    final_result = (final_result.astype(int)).astype(str).tolist()
    final_result = list(map(lambda t: " ".join(t), final_result))
    choice_pred = choice_pred.astype(str).tolist()
    choice_pred = list(map(lambda t: " ".join(t), choice_pred))

    df = pd.DataFrame(data={"0": final_result})
    df.to_csv(config["inference"]["output_path"], header=None, index=None)

    cdf = pd.DataFrame(data={"0": choice_pred})
    cdf.to_csv(config["inference"]["choice_path"], header=None, index=None)
    print("\tDone.")
def firstnames():
    filename = os.path.join(utils.root(), 'names', 'firstnames.csv')
    return utils.loadfile(filename, _format='split')
def city_data():
    filename = os.path.join(utils.root(), 'addresses', 'cityinfo.json')
    return utils.loadfile(filename, _format='json')
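# --- Hypothetical helper sketch (not part of the original snippet) ---
# `utils.loadfile` is not shown above. A minimal sketch, assuming
# _format='split' returns one stripped line per list entry and
# _format='json' parses the file as JSON:
import json

def loadfile(filename, _format='split'):
    with open(filename) as f:
        if _format == 'json':
            return json.load(f)
        return f.read().splitlines()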