def _get_data(self):
    try:
        d1, self.fmt = utils.loadfile(filename=self.obj1)
        d2, self.fmt = utils.loadfile(filename=self.obj2)
    except Exception:
        # Fall back to a plain textual diff when the files cannot be parsed.
        retcode, out, err = utils.runCommand(
            "diff -u %s %s" % (self.obj1, self.obj2))
        if out:
            out = re.sub(r"%s.*" % (self.obj1), self.obj2, out)
            out = re.sub(r"%s.*" % (self.obj2), self.obj2, out)
        if err:
            err = re.sub(r"%s.*" % (self.obj1), self.obj2, err)
            err = re.sub(r"%s.*" % (self.obj2), self.obj2, err)
        return 1, out, err
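A minimal sketch of a loadfile() helper with this call shape, inferred purely from the call sites above; it is an assumption, not the project's actual utils.loadfile:

import json

def loadfile(filename):
    # Hypothetical helper: returns (data, fmt), guessing the format from the
    # content. The real utils.loadfile may parse more formats or raise instead.
    with open(filename) as handle:
        text = handle.read()
    try:
        return json.loads(text), "json"
    except ValueError:
        return text, "text"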
Example #2
def main():

    # Arguments logic
    parser = argparse.ArgumentParser(description='Web crawler')
    parser.add_argument('website', nargs='?', action='store', help='website to crawl')
    parser.add_argument('-l', action='store', default=2, help='maximum depth level to crawl')
    parser.add_argument('-resume', action='store_const', const=32, help='resume crawler')

    global crawler_state
    
    # Arguments parser
    args = parser.parse_args()
    
    # Crawler
    c = Crawler(args.website, int(args.l))
    crawler_state = c
    
    # Register Ctrl+C
    signal.signal(signal.SIGINT, signal_handler)
    
    if not args.resume:
        info = c.crawl(args.website)
    else:
        #recover state
        print "recovering..."
        raw = loadfile('state.data')
        data = pickle.loads(raw)
        c.output = data.output
        
        c.queue = data.queue        
        c.queue.append((data.current_url, data.level))
        
        c.visited = data.visited
        c.url = data.url
        c.maxdepth = data.maxdepth
        info = c.crawl(data.url)
        print "OK"
        
    
    # Save data
    savefile('structure.txt', info)
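The resume branch above unpickles a state object written elsewhere; a hedged sketch of the matching signal_handler (savefile and crawler_state come from the snippet, the pickling of the crawler state itself is an assumption):

def signal_handler(signum, frame):
    # Hypothetical Ctrl+C handler: persist the current crawler so a later run
    # with -resume can rebuild its queue and visited set from 'state.data'.
    raw = pickle.dumps(crawler_state)
    savefile('state.data', raw)
    raise SystemExit(0)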
Example #3
def load_data(dataset_str):
    names = [['ent_ids_1', 'ent_ids_2'],
             ['training_attrs_1', 'training_attrs_2'],
             ['triples_1', 'triples_2'], ['ref_ent_ids']]
    for fns in names:
        for i in range(len(fns)):
            fns[i] = 'data/' + dataset_str + '/' + fns[i]
    Es, As, Ts, ill = names
    ill = ill[0]
    ILL = loadfile(ill, 2)
    illL = len(ILL)
    np.random.shuffle(ILL)
    train = np.array(ILL[:illL // 10 * FLAGS.seed])
    test = ILL[illL // 10 * FLAGS.seed:]

    kg1 = get_ent2id([Es[0]])
    kg1_list = list(kg1.values())
    kg2 = get_ent2id([Es[1]])
    kg2_list = list(kg2.values())

    save_train_test_into_txt(np.asarray(test), 'test.txt')
    _, new_train = generate_fake(train, test, kg1_list, kg2_list)
    save_train_test_into_txt(new_train, 'new_train.txt')
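A possible shape for the loadfile(fn, num) used here, inferred from the call loadfile(ill, 2): each line holds num tab-separated integer ids and the result is a list of tuples. This is an assumption, not the dataset loader's confirmed implementation:

def loadfile(fn, num=1):
    # Hypothetical reader: one record per line, `num` tab-separated int ids.
    ret = []
    with open(fn, encoding='utf-8') as handle:
        for line in handle:
            parts = line.strip().split('\t')
            ret.append(tuple(int(x) for x in parts[:num]))
    return ret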
Example #4
def main(args):
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")
    choice_qs = tf.placeholder(tf.float32, [None, None], name="choice")
    emo_cat = tf.placeholder(tf.int32, [None], name="emotion_category")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir, dec_num_layers,
     dec_num_units, dec_cell_type, state_pass, num_emo, emo_cat_units,
     emo_int_units, infer_batch_size, beam_size, max_iter, attn_num_units,
     l2_regularize) = get_ECM_config(config)

    print("Building model architecture ...")
    CE, loss, train_outs, infer_outputs = compute_ECM_loss(
        source_ids, target_ids, sequence_mask, choice_qs, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass, num_emo,
        emo_cat, emo_cat_units, emo_int_units, infer_batch_size, beam_size,
        max_iter, attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    (logdir, restore_from, learning_rate, gpu_fraction, max_checkpoints,
     train_steps, batch_size, print_every, checkpoint_every, s_filename,
     t_filename, q_filename, c_filename, s_max_leng, t_max_leng,
     dev_s_filename, dev_t_filename, dev_q_filename, dev_c_filename, loss_fig,
     perp_fig) = get_ECM_training_config(config)

    is_overwritten_training = logdir != restore_from

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       epsilon=1e-4)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up session
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=max_checkpoints)

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1

    except Exception:
        print("Something went wrong while restoring checkpoint. "
              "Training is terminated to avoid the overwriting.")
        raise

    # ##### Training #####
    # Load data
    print("Loading data ...")

    # id_0, id_1, id_2 reserved for SOS, EOS, constant zero padding
    embed_shift = 3

    source_data = loadfile(s_filename, is_source=True,
                           max_length=s_max_leng) + embed_shift
    target_data = loadfile(t_filename, is_source=False,
                           max_length=t_max_leng) + embed_shift

    choice_data = loadfile(q_filename, is_source=False, max_length=t_max_leng)
    choice_data[choice_data < 0] = 0
    choice_data = choice_data.astype(np.float32)

    category_data = pd.read_csv(c_filename,
                                header=None,
                                index_col=None,
                                dtype=int)[0].values

    masks = (target_data != -1)
    n_data = len(source_data)

    dev_source_data = None
    if dev_s_filename is not None:
        dev_source_data = loadfile(dev_s_filename,
                                   is_source=True,
                                   max_length=s_max_leng) + embed_shift
        dev_target_data = loadfile(dev_t_filename,
                                   is_source=False,
                                   max_length=t_max_leng) + embed_shift

        dev_choice_data = loadfile(dev_q_filename,
                                   is_source=False,
                                   max_length=t_max_leng)
        dev_choice_data[dev_choice_data < 0] = 0
        dev_choice_data = dev_choice_data.astype(np.float32)

        dev_category_data = pd.read_csv(dev_c_filename,
                                        header=None,
                                        index_col=None,
                                        dtype=int)[0].values

        dev_masks = (dev_target_data != -1)
    print("\tDone.")

    # Training
    last_saved_step = saved_global_step
    num_steps = saved_global_step + train_steps
    losses = []
    steps = []
    perps = []
    dev_perps = []

    print("Start training ...")
    try:
        for step in range(saved_global_step + 1, num_steps):
            start_time = time.time()
            rand_indexes = np.random.choice(n_data, batch_size)
            source_batch = source_data[rand_indexes]
            target_batch = target_data[rand_indexes]
            choice_batch = choice_data[rand_indexes]
            emotions = category_data[rand_indexes]
            mask_batch = masks[rand_indexes]

            feed_dict = {
                source_ids: source_batch,
                target_ids: target_batch,
                choice_qs: choice_batch,
                emo_cat: emotions,
                sequence_mask: mask_batch,
            }

            loss_value, _ = sess.run([loss, optim], feed_dict=feed_dict)
            losses.append(loss_value)

            duration = time.time() - start_time

            if step % print_every == 0:
                # train perplexity
                t_perp = compute_perplexity(sess, CE, mask_batch, feed_dict)
                perps.append(t_perp)

                # dev perplexity
                dev_str = ""
                if dev_source_data is not None:
                    dev_inds = np.random.choice(len(dev_source_data),
                                                batch_size)

                    dev_feed_dict = {
                        source_ids: dev_source_data[dev_inds],
                        target_ids: dev_target_data[dev_inds],
                        choice_qs: dev_choice_data[dev_inds],
                        emo_cat: dev_category_data[dev_inds],
                        sequence_mask: dev_masks[dev_inds],
                    }

                    dev_perp = compute_perplexity(sess, CE,
                                                  dev_masks[dev_inds],
                                                  dev_feed_dict)
                    dev_perps.append(dev_perp)
                    dev_str = "dev_prep: {:.3f}, ".format(dev_perp)

                steps.append(step)
                info = 'step {:d}, loss = {:.6f}, '
                info += 'perp: {:.3f}, {}({:.3f} sec/step)'
                print(info.format(step, loss_value, t_perp, dev_str, duration))

            if step % checkpoint_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C so save message is on its own line.
        print()

    finally:
        if step > last_saved_step:
            save(saver, sess, logdir, step)

        # plot loss
        plt.figure()
        plt.plot(losses)
        plt.title("Total loss")
        plt.xlabel("step")
        plt.savefig(loss_fig)

        # plot perplexity
        plt.figure()
        if len(perps) > len(steps):
            perps.pop()
        plt.plot(steps[5:], perps[5:], label="train")
        if dev_source_data is not None:
            plt.plot(steps[5:], dev_perps[5:], label="dev")
        plt.title("Perplexity")
        plt.xlabel("step")
        plt.legend()
        plt.savefig(perp_fig)
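The training script above expects loadfile to return padded integer matrices (the mask computation and the choice_data < 0 check imply a negative padding sentinel). A minimal sketch under those assumptions; the real helper and its exact pad value are not shown in the snippet:

import numpy as np

PAD_ID = -1  # assumption; the real helper may use a different sentinel

def loadfile(filename, is_source=True, max_length=40):
    # Hypothetical reader: one whitespace-separated sequence of token ids per
    # line, truncated and right-padded with PAD_ID to max_length. is_source is
    # kept only to mirror the call signature above.
    rows = []
    with open(filename) as handle:
        for line in handle:
            ids = [int(tok) for tok in line.split()][:max_length]
            rows.append(ids + [PAD_ID] * (max_length - len(ids)))
    return np.array(rows, dtype=np.int32)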
Example #5
def main(args):
    # loading configurations
    with open(args.config) as f:
        config = yaml.safe_load(f)["configuration"]

    name = config["Name"]

    # Construct or load embeddings
    print("Initializing embeddings ...")
    vocab_size = config["embeddings"]["vocab_size"]
    embed_size = config["embeddings"]["embed_size"]
    embeddings = init_embeddings(vocab_size, embed_size, name=name)
    print("\tDone.")

    # Build the model and compute losses
    source_ids = tf.placeholder(tf.int32, [None, None], name="source")
    target_ids = tf.placeholder(tf.int32, [None, None], name="target")
    sequence_mask = tf.placeholder(tf.bool, [None, None], name="mask")
    choice_qs = tf.placeholder(tf.float32, [None, None], name="choice")
    emo_cat = tf.placeholder(tf.int32, [None], name="emotion_category")

    (enc_num_layers, enc_num_units, enc_cell_type, enc_bidir, dec_num_layers,
     dec_num_units, dec_cell_type, state_pass, num_emo, emo_cat_units,
     emo_int_units, infer_batch_size, beam_size, max_iter, attn_num_units,
     l2_regularize) = get_ECM_config(config)

    print("Building model architecture ...")
    CE, loss, train_outs, infer_outputs = compute_ECM_loss(
        source_ids, target_ids, sequence_mask, choice_qs, embeddings,
        enc_num_layers, enc_num_units, enc_cell_type, enc_bidir,
        dec_num_layers, dec_num_units, dec_cell_type, state_pass, num_emo,
        emo_cat, emo_cat_units, emo_int_units, infer_batch_size, beam_size,
        max_iter, attn_num_units, l2_regularize, name)
    print("\tDone.")

    # Set up session
    restore_from = config["training"]["restore_from"]
    gpu_fraction = config["training"]["gpu_fraction"]
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False,
                                            gpu_options=gpu_options))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables())

    try:
        saved_global_step = load(saver, sess, restore_from)
        if saved_global_step is None:
            raise ValueError("Cannot find the checkpoint to restore from.")

    except Exception:
        print("Something went wrong while restoring checkpoint. ")
        raise

    # ##### Inference #####
    # Load data
    print("Loading inference data ...")

    # id_0, id_1, id_2 reserved for SOS, EOS, constant zero padding
    embed_shift = 3
    filename = config["inference"]["infer_source_file"]
    c_filename = config["inference"]["infer_category_file"]
    max_leng = config["inference"]["infer_source_max_length"]

    source_data = loadfile(filename, is_source=True,
                           max_length=max_leng) + embed_shift
    category_data = pd.read_csv(c_filename,
                                header=None,
                                index_col=None,
                                dtype=int)[0].values
    print("\tDone.")

    # Inference
    print("Start inferring ...")
    final_result = []
    n_data = source_data.shape[0]
    n_pad = n_data % infer_batch_size
    if n_pad > 0:
        n_pad = infer_batch_size - n_pad

    pad = np.zeros((n_pad, max_leng), dtype=np.int32)
    source_data = np.concatenate((source_data, pad))
    category_data = np.concatenate((category_data, np.zeros(n_pad)))

    for ith in range(int(len(source_data) / infer_batch_size)):
        start = ith * infer_batch_size
        end = (ith + 1) * infer_batch_size
        batch = source_data[start:end]
        batch_cat = category_data[start:end]

        result = sess.run(infer_outputs,
                          feed_dict={
                              source_ids: batch,
                              emo_cat: batch_cat
                          })
        result = result.ids[:, :, 0]

        if result.shape[1] < max_iter:
            l_pad = max_iter - result.shape[1]
            result = np.concatenate(
                (result, np.ones((infer_batch_size, l_pad))), axis=1)

        final_result.append(result)

    final_result = np.concatenate(final_result)[:n_data] - embed_shift
    choice_pred = (final_result >= vocab_size).astype(int)
    final_result[final_result >= vocab_size] -= (vocab_size + embed_shift)

    # transform to output format
    final_result[final_result < 0] = -1
    final_result = (final_result.astype(int)).astype(str).tolist()
    final_result = list(map(lambda t: " ".join(t), final_result))

    choice_pred = choice_pred.astype(str).tolist()
    choice_pred = list(map(lambda t: " ".join(t), choice_pred))

    df = pd.DataFrame(data={"0": final_result})
    df.to_csv(config["inference"]["output_path"], header=None, index=None)

    cdf = pd.DataFrame(data={"0": choice_pred})
    cdf.to_csv(config["inference"]["choice_path"], header=None, index=None)
    print("\tDone.")
Example #6
def firstnames():
    filename = os.path.join(utils.root(), 'names', 'firstnames.csv')
    return utils.loadfile(filename, _format='split')

def city_data():
    filename = os.path.join(utils.root(), 'addresses', 'cityinfo.json')
    return utils.loadfile(filename, _format='json')
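These two call sites suggest a format-dispatching helper; a hedged sketch of utils.loadfile(filename, _format=...) under that assumption ('split' as a list of lines, 'json' as a parsed object):

import json

def loadfile(filename, _format='split'):
    # Hypothetical dispatcher matching the two calls above; the project's real
    # utils.loadfile may support more formats or split lines differently.
    with open(filename, encoding='utf-8') as handle:
        if _format == 'json':
            return json.load(handle)
        return handle.read().splitlines()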