Example #1
    def createWebApp(cls, name, url):
        """Create a new application by name and url."""
        if len(name.strip()) == 0:
            raise ValueError("Name is empty!")

        # Assumes module-level imports of urllib2, uuid, json and utils;
        # BeautifulSoup here is the Python 2-era BeautifulSoup 3 package.
        import BeautifulSoup
        soup = BeautifulSoup.BeautifulSoup(
                urllib2.urlopen(
                    urllib2.Request(
                        url,
                        headers={'User-Agent':
                            "Mozilla/5.0 (X11; U; Linux i686) " +
                            "Gecko/20071127 Firefox/2.0.0.11"
                        })))
        appJson = {
            'uuid': str(uuid.uuid4()),
            'name': name,
            'url': url,
            'icon': 'icon.png',
            'size': [800, 600],
        }
        icon_url = soup.find("link", rel="apple-touch-icon")
        if icon_url is None:
            icon_url = soup.find("link", rel="shortcut icon")
            appJson['icon'] = 'icon.ico'
        if icon_url:
            appJson['icon-url'] = icon_url['href']

        app_dir = cls.get_local_apps_dir(appJson['uuid'])
        utils.ensure_dir_exists(app_dir)
        with open("%s/%s" % (app_dir, 'app.json'), 'w') as f:
            json.dump(appJson, f, ensure_ascii=False, indent=4)
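Every example in this collection calls an ensure_dir_exists helper whose implementation is not shown. A minimal sketch of what such a helper typically looks like, offered only as an assumption (some examples below assign its return value, so this version returns the path):

import os

def ensure_dir_exists(path):
    # Hypothetical implementation: create the directory (and any parents) if
    # missing, tolerate it already existing, and hand the path back so calls
    # like TEXT_DIR = ensure_dir_exists('/data/text/') keep working.
    if path:
        os.makedirs(path, exist_ok=True)
    return path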
Example #2
    def save_current_plot_to_file(self, plot_fn):
        import utils
        import os.path
        utils.ensure_dir_exists(os.path.dirname(plot_fn))

        import matplotlib.pyplot as plt
        plt.savefig(plot_fn, format='pdf')
        print "Saved plot to %s" % (plot_fn)
Example #3
def make_all_zips(count=50, target_dir='zips', pool_size=cpu_count()):
    ensure_dir_exists(target_dir)
    filenames = [
        os.path.join(target_dir, '%d.zip' % index)
        for index in range(count)
    ]
    with ProcessPoolExecutor(pool_size) as executor:
        return executor.map(make_zip, filenames)
Example #4
def save_current_plot(graph_name, plot_name, method_name, res_num):
    import utils
    import os.path
    import matplotlib.pyplot as plt

    plot_dir = "plots/%s" % (graph_name)
    plot_fn = "%s/%s_%s_r%d.pdf" % (plot_dir, plot_name, method_name, res_num)

    utils.ensure_dir_exists(os.path.dirname(plot_fn))
    plt.savefig(plot_fn, format="pdf")
    print "Saved plot to %s" % (plot_fn)
Example #5
    def get_user_file(self, file_name, make_dir=True, touch_file=False):
        user_dir = self.get_user_dir()
        full_path = os.path.join(user_dir, file_name)
        if make_dir:
            full_dir = os.path.dirname(full_path)
            utils.ensure_dir_exists(full_dir)

        if touch_file and not os.path.exists(full_path):
            logger.info("Creating file %s" % full_path)
            os.mknod(full_path)
        return full_path
Example #6
def make_csvs(source_dir='zips', target_dir='csvs', pool_size=cpu_count()):
    ensure_dir_exists(target_dir)

    zip_filenames = glob(os.path.join(source_dir, '*.zip'))

    f1, roots = open_csv_writer(os.path.join(target_dir, 'roots.csv'))
    f2, objects = open_csv_writer(os.path.join(target_dir, 'objects.csv'))

    try:
        roots.writerow(('id', 'level'))
        objects.writerow(('id', 'object_name'))

        with ProcessPoolExecutor(pool_size) as executor:
            ngdata_per_zip = executor.map(extract_ngdata_from_zip, zip_filenames)
            for ngdata in iflatten(ngdata_per_zip):
                roots.writerow((ngdata.id, ngdata.level))
                for object_name in ngdata.object_names:
                    objects.writerow((ngdata.id, object_name))
    finally:
        f1.close()
        f2.close()
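Example #6 also relies on two helpers that are not shown, open_csv_writer and iflatten. Hypothetical versions consistent with how they are called above (the project's real helpers may differ):

import csv
from itertools import chain

def open_csv_writer(path):
    # Return the open file handle (so the caller can close it in `finally`)
    # together with a csv.writer bound to that handle.
    f = open(path, 'w', newline='')
    return f, csv.writer(f)

def iflatten(iterables):
    # Lazily flatten one level of nesting: an iterable of iterables becomes
    # a single flat iterator.
    return chain.from_iterable(iterables)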
Example #7
        poison_rates = [
            float(x) for x in args.poison_rates.replace(" ", "").split(',')
        ]
        assert len(poison_rates) > 0, 'Provide at least one theta value'
        args.poison_rates = poison_rates
    except ValueError:
        raise ValueError("Theta values not provided in correct format")

    # Before saving anything, make sure directory exists
    model_dir = os.path.join(
        args.dir_prefix, "{}/target/"
        "arch-{}_target-{}_goal-{}_"
        "rule-{}/loss-{}".format(args.dataset, args.model_arch,
                                 args.poison_class, args.attacker_goal,
                                 args.c_rule, args.loss))
    utils.ensure_dir_exists(model_dir)

    og_save_poisoned_data = args.save_poisoned_data

    best_model_obj, best_loss = None, np.inf
    for ratio in poison_rates:
        if args.low_confidence and ratio > 1:
            raise ValueError("Highest-loss selection with ratio > 1 "
                             "makes no sense")

        # Make sure data is saved
        if og_save_poisoned_data is not None:
            args.save_poisoned_data = os.path.join(
                og_save_poisoned_data,
                "seed_{}/ratio_{}".format(args.seed, ratio))
            utils.ensure_dir_exists(args.save_poisoned_data)
Example #8
    def get_level_path(self, level):
        path = self.base_path + "/" + self.get_level_textual(level) + ".log"
        utils.ensure_dir_exists(self.base_path)
        return path
Example #9
def run(sess, f, data):
    # load data that will be used for evaluating the distillation process
    eval_data = d.get(f.eval_dataset, f)

    # load teacher graph
    _, output_size = data.io_shape
    inputs, teacher_outputs, _, teacher_feed_dicts = m.get(f.model).load_model(sess, f.model_meta, f.model_checkpoint, output_size)
    teacher_outputs = tf.stop_gradient(tf.nn.softmax(teacher_outputs))

    # create student graph
    outputs, _, feed_dicts = m.get(f.model).create_model(inputs, output_size)

    loss, train_step = create_train_ops(outputs, teacher_outputs, lr=f.lr, loss=f.loss)
    accuracy = create_eval_ops(outputs, teacher_outputs)
    summary_op = create_summary_ops(loss, accuracy)

    # only initialize non-initialized vars:
    u.init_uninitted_vars(sess)
    # (this is very important in distill: we don't want to reinit teacher model)

    saver = tf.train.Saver(tf.global_variables())

    summary_dir = os.path.join(f.summary_folder, f.run_name, 'distill')
    train_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'train'), sess.graph)
    trainbatch_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'train_batch'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'test'), sess.graph)

    with sess.as_default():
        global_step = 0

        print('Note: accuracies here are how much the student correlates to the teacher.')
        print('For true set accuracy, multiply by teacher\'s accuracy.')

        for i in range(f.epochs):
            print('Epoch: {}'.format(i))
            for batch_x, _ in data.train_epoch_in_batches(f.train_batch_size):
                # train step. we don't need to feed batch_y because the student
                # is being trained to mimic the teacher's temperature-scaled
                # activations.
                summary, _ = sess.run([summary_op, train_step],
                        feed_dict={**teacher_feed_dicts['distill'],
                                   **feed_dicts['distill'],
                                   inputs: batch_x})
                trainbatch_writer.add_summary(summary, global_step)

                if global_step % f.eval_interval == 0:
                    # eval test
                    summaries = []
                    for test_batch_x, test_batch_y in eval_data.test_epoch_in_batches(f.test_batch_size):
                        summary = sess.run(summary_op,
                                feed_dict={**teacher_feed_dicts['distill'],
                                           **feed_dicts['distill'],
                                           inputs: test_batch_x})
                        summaries.append(summary)
                    test_writer.add_summary(merge_summary_list(summaries, True), global_step)

                    # eval train
                    summaries = []
                    for train_batch_x, train_batch_y in data.train_epoch_in_batches(f.train_batch_size):
                        summary = sess.run(summary_op,
                                feed_dict={**teacher_feed_dicts['distill'],
                                           **feed_dicts['distill'],
                                           inputs: train_batch_x})
                        summaries.append(summary)
                    train_writer.add_summary(merge_summary_list(summaries, True), global_step)

                global_step += 1

                if global_step % f.checkpoint_interval == 0:
                    checkpoint_dir = os.path.join(summary_dir, 'checkpoint/')
                    ensure_dir_exists(checkpoint_dir)
                    checkpoint_file = os.path.join(checkpoint_dir, f.model)
                    saver.save(sess, checkpoint_file, global_step=global_step)
                    print('distilled model saved in {}'.format(checkpoint_file))

    print('distilled model saved in {}'.format(checkpoint_file))
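Examples #9 and #32 call a merge_summary_list helper that is not shown. A hypothetical TF1-style stand-in consistent with the call sites; the meaning of the second argument is not visible in the snippets, so it is accepted and ignored here:

import collections
import tensorflow as tf

def merge_summary_list(summary_list, unused_flag=True):
    # Parse each serialized Summary proto, average every scalar tag across
    # the list, and return one merged Summary proto, which
    # FileWriter.add_summary() accepts directly.
    totals = collections.defaultdict(float)
    counts = collections.defaultdict(int)
    for serialized in summary_list:
        summ = tf.Summary()
        summ.ParseFromString(serialized)
        for value in summ.value:
            totals[value.tag] += value.simple_value
            counts[value.tag] += 1
    merged = tf.Summary()
    for tag, total in totals.items():
        merged.value.add(tag=tag, simple_value=total / counts[tag])
    return merged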
Example #10
import sys
sys.path.append('../12net')
import numpy as np
import os, cv2
import numpy.random as npr
from utils import IOU, ensure_dir_exists

anno_file = 'wider_face_train.txt'
img_dir = 'WIDER_train/images'

pos_save_dir = '../12net/12/positive'
part_save_dir = '../12net/12/part'
neg_save_dir = '../12net/12/negative'
save_dir = '../12net/12'

ensure_dir_exists(save_dir)
ensure_dir_exists(pos_save_dir)
ensure_dir_exists(part_save_dir)
ensure_dir_exists(neg_save_dir)

f1 = open(os.path.join(save_dir, 'pos_12.txt'), 'w')
f2 = open(os.path.join(save_dir, 'neg_12.txt'), 'w')
f3 = open(os.path.join(save_dir, 'part_12.txt'), 'w')

with open(anno_file, 'r') as f:
    annotations = f.readlines()

num = len(annotations)
print('%d pics in total' % num)

p_idx = 0
Example #11
def _rsync_dir(source_dir, dest_dir):
    ensure_dir_exists(dest_dir)
    with open('.rsync_log', 'ab') as rsync_log:
        subprocess.call(['gsutil', '-m', 'rsync', source_dir, dest_dir],
                        stderr=rsync_log)
Example #12
def ensure_query_file(qfile: str):
    '''
    Ensure that the file exists otherwise creates it. This file will contain an
    aggregation of all the queries executed against the inverted index.
    '''
    if os.path.isfile(qfile):
        return

    # Write "{}" so that utils.load_json_from_disk loads an empty dict
    # instead of raising an exception.
    with open(qfile, mode='w') as f:
        f.write("{}")


if __name__ == '__main__':

    args = init_params()
    utils.ensure_dir_exists('output')
    ensure_query_file(args.output_file)

    if args.query_string is None or not isinstance(args.query_string, str):
        print("Please provide a query str with flag -q")
        exit()

    inv_index: Dict[str, Tuple[int, List[int]]] = utils.load_index(args.input_file)

    result: dict = exec_query(args.query_string, inv_index)
    x = result[args.query_string]
    print(f'<{args.query_string}> query was {x["message"]}: {x["frequency"]} hits found')

    queries: dict = utils.load_json_from_disk(args.output_file)

    del x['message']
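The comment in ensure_query_file refers to utils.load_json_from_disk; a plausible minimal version, assuming it does nothing beyond parsing the file (the project's real helper may add error handling):

import json

def load_json_from_disk(path):
    # The file is guaranteed by ensure_query_file() to exist and to hold at
    # least "{}", so simply parse it and return the resulting dict.
    with open(path) as f:
        return json.load(f)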
                        action="store_true",
                        help='If true, print per-epoch training statistics')

    args = parser.parse_args()

    if args.verbose:
        args.verbose_pretrain = True

    try:
        wanted_errors = [float(x) for x in args.errors.split(",")]
        print(utils.red_print("Target error rates: %s" % str(wanted_errors)))
    except ValueError:
        raise ValueError("Wanted errors provided in invalid format")

    # Ensure directory exists where model will be saved
    utils.ensure_dir_exists(args.save_dir)

    # Print all arguments
    utils.flash_utils(args)

    # Prepare logger
    log_dir = os.path.join(
        args.log_path,
        "indiscriminate_" + str(args.n_copies) + "_" + str(args.seed))
    utils.ensure_dir_exists(log_dir)
    logger = SummaryWriter(log_dir=log_dir, flush_secs=10)

    print(utils.pink_print("Running attack"))
    indiscriminateAttack(logger, wanted_errors, args)

    # Close logger
Example #14
def test(**kwargs):
    params = Params(kwargs)

    print('Params:')
    params.pretty_print()
    print()

    use_cuda = params.use_cuda
    if use_cuda:
        assert torch.cuda.is_available()

    with Timer('Loading models'):
        gen_a_to_b, gen_b_to_a = load_models_for_evaluation(
            params.checkpoint_path)

    print('#weights in gen_a_to_b:',
          natural.number.number(model_utils.compute_num_weights(gen_a_to_b)))
    print('#weights in gen_b_to_a:',
          natural.number.number(model_utils.compute_num_weights(gen_b_to_a)))

    if use_cuda:
        gen_a_to_b.cuda()
        gen_b_to_a.cuda()

    a_to_b_save_path = join_path(params.test_save_path, c.A_TO_B_GEN_TEST_DIR)
    b_to_a_save_path = join_path(params.test_save_path, c.B_TO_A_GEN_TEST_DIR)

    ensure_dir_exists(a_to_b_save_path)
    ensure_dir_exists(b_to_a_save_path)

    filenames = utils.listdir(params.dataset_a, extensions=('.png', '.jpg'))
    for filename in tqdm(filenames, desc='A to B'):
        filepath = join_path(params.dataset_a, filename)

        a = image_utils.load_image(filepath)

        b_fake = generate_fake_image(image=a,
                                     generator_net=gen_a_to_b,
                                     use_cuda=use_cuda)

        root, ext = os.path.splitext(filename)
        a_filepath = join_path(a_to_b_save_path, '{}-a{}'.format(root, ext))
        skimage.io.imsave(a_filepath, a)

        a_to_b_filepath = join_path(a_to_b_save_path,
                                    '{}-a-to-b{}'.format(root, ext))
        skimage.io.imsave(a_to_b_filepath, b_fake)

    filenames = utils.listdir(params.dataset_b, extensions=('.png', '.jpg'))
    for filename in tqdm(filenames, desc='B to A'):
        filepath = join_path(params.dataset_b, filename)

        b = image_utils.load_image(filepath)

        a_fake = generate_fake_image(image=b,
                                     generator_net=gen_b_to_a,
                                     use_cuda=use_cuda)

        root, ext = os.path.splitext(filename)
        b_filepath = join_path(b_to_a_save_path, '{}-b{}'.format(root, ext))
        skimage.io.imsave(b_filepath, b)

        b_to_a_filepath = join_path(b_to_a_save_path,
                                    '{}-b-to-a{}'.format(root, ext))
        skimage.io.imsave(b_to_a_filepath, a_fake)
Example #15
    split_1 = ch.cat(split_1)
    split_2 = ch.cat(split_2)

    data_first = (X[split_1], Y[split_1])
    data_second = (X[split_2], Y[split_2])

    return data_first, data_second


if __name__ == "__main__":
    mnist17 = datasets.dataset_helper("mnist17")()
    train_1, train_2 = stratified_split(mnist17.train)
    val_1, val_2 = stratified_split(mnist17.val)

    # Ensure directory exists
    utils.ensure_dir_exists("./data/datasets/MNIST17/")

    # Save these files
    ch.save(
        {
            "train": {
                "data": train_1[0],
                "targets": train_1[1]
            },
            "val": {
                "data": val_1[0],
                "targets": val_1[1]
            },
        }, "./data/datasets/MNIST17/split_1.pt")

    ch.save(
Example #16
from collections import Counter
import nltk
import os
from tqdm import tqdm
import utils
from utils import CACHE_DIR

TOKENIZING_REGEX = r"[a-zA-Z]+[-']{0,1}[a-zA-Z]*[']{0,1}"  # supports hyphenated words and trailing apostrophes

utils.ensure_dir_exists(CACHE_DIR)


def custom_tokenizer(text: str, tokenizer, stem_doc=False) -> list:
    tokens = tokenizer.tokenize(text)
    stemmer = nltk.stem.PorterStemmer()

    cleaned_tokens = []

    for token in tokens:

        if token == "":
            continue

        while token[-1] == "-" or token[-1] == "'":
            token = token[:-1]

        # don't bother adding single letters to the index; the smallest last names should be >= 2 letters
        if len(token) > 1:
            token = token.lower()
            if stem_doc:
Example #17
from loader import generate_train_data
from prototypes import wavenet
from utils import ensure_dir_exists

if __name__ == '__main__':
    input_length = 4000
    epochs = 500
    ensure_dir_exists('models/')

    model = wavenet(input_length)

    for e in range(epochs):
        train_batches = generate_train_data(input_length, 1000)
        print("Epoch {}/{}:".format(e + 1, epochs))
        for i, (x, y) in enumerate(train_batches):
            model.fit(x, y, batch_size=4, epochs=1, verbose=2)

        if (e + 1) % 50 == 0:
            print("Saving intermediate model weights...")
            model.save_weights('models/tmp.h5')

    print("\nTraining complete!\nSaving model...")
    model.save_weights('models/final_weights.h5')
    print("Model saved, terminate.")
Example #18
def train(**kwargs):
    params = Params(kwargs)

    print('Params:')
    params.pretty_print()
    print()

    use_cuda = params.use_cuda
    if use_cuda:
        assert torch.cuda.is_available()

    with Timer('Initializing'):
        a_image_generator = create_image_generator(params.dataset_a)
        b_image_generator = create_image_generator(params.dataset_b)

        gen_a_to_b, gen_b_to_a, discr_a, discr_b = load_models_for_training(params.checkpoint_path)

        print('#weights in gen_a_to_b:', natural.number.number(model_utils.compute_num_weights(gen_a_to_b)))
        print('#weights in gen_b_to_a:', natural.number.number(model_utils.compute_num_weights(gen_b_to_a)))
        print('#weights in discr_a:', natural.number.number(model_utils.compute_num_weights(discr_a)))
        print('#weights in discr_b:', natural.number.number(model_utils.compute_num_weights(discr_b)))

        if use_cuda:
            gen_a_to_b.cuda()
            gen_b_to_a.cuda()
            discr_a.cuda()
            discr_b.cuda()

        betas = (params.adam_beta1, params.adam_beta2)
        optimizer_generators = torch.optim.Adam(params=itertools.chain(gen_a_to_b.parameters(), gen_b_to_a.parameters()),
                                                lr=params.gen_learning_rate, betas=betas)
        optimizer_discr_a = torch.optim.Adam(params=discr_a.parameters(), lr=params.discr_learning_rate, betas=betas)
        optimizer_discr_b = torch.optim.Adam(params=discr_b.parameters(), lr=params.discr_learning_rate, betas=betas)

        cycle_criterion = nn.L1Loss()
        discr_criterion = nn.MSELoss()

        one_array = torch.ones((params.batch_size, 1, 30, 30))  # Has the same size as the output of discr_a and discr_b
        if use_cuda:
            one_array = one_array.cuda()
        one_array = Variable(one_array, requires_grad=False)

        zero_array = torch.zeros((params.batch_size, 1, 30, 30))
        if use_cuda:
            zero_array = zero_array.cuda()
        zero_array = Variable(zero_array, requires_grad=False)

        a_fake_image_pool = image_utils.ImagePool(params.image_pool_size)
        b_fake_image_pool = image_utils.ImagePool(params.image_pool_size)

    header = '\t'.join(FIELD_NAMES)
    print(header)

    # Train:

    with open(params.log_filename, 'w') as csvfile:
        dict_writer = csv.DictWriter(csvfile, FIELD_NAMES)
        dict_writer.writeheader()

        for i in itertools.count():
            timer = Timer('train step', verbose=False)

            with timer:
                a = generate_batch_variable(a_image_generator, use_cuda, params.batch_size)
                b = generate_batch_variable(b_image_generator, use_cuda, params.batch_size)

                generators_loss = models.compute_generators_loss(gen_a_to_b, gen_b_to_a, discr_a, discr_b, a, b,
                                                                 cycle_criterion, discr_criterion, one_array,
                                                                 a_fake_image_pool, b_fake_image_pool)
                optimize(optimizer_generators, generators_loss)

                discr_a_loss = models.compute_discr_loss(discr_a, a, a_fake_image_pool, discr_criterion, zero_array, one_array)
                optimize(optimizer_discr_a, discr_a_loss)

                discr_b_loss = models.compute_discr_loss(discr_b, b, b_fake_image_pool, discr_criterion, zero_array, one_array)
                optimize(optimizer_discr_b, discr_b_loss)

            row = collections.OrderedDict()
            row['step'] = str(i)
            row['generators_loss'] = float_to_string(generators_loss)
            row['discr_a_loss'] = float_to_string(discr_a_loss)
            row['discr_b_loss'] = float_to_string(discr_b_loss)
            row['total_loss'] = float_to_string(generators_loss + discr_a_loss + discr_b_loss)
            row['duration'] = '{0:.2f}s'.format(timer.get_duration())
            dict_writer.writerow(row)
            print('\t'.join(row.values()))

            if i % params.save_step == 0 and i > 0:
                with Timer('Saving models'):
                    ensure_dir_exists(params.checkpoint_path)

                    torch.save(gen_a_to_b.state_dict(), join_path(params.checkpoint_path, c.A_TO_B_GEN_DIR))
                    torch.save(gen_b_to_a.state_dict(), join_path(params.checkpoint_path, c.B_TO_A_GEN_DIR))

                    torch.save(discr_a.state_dict(), join_path(params.checkpoint_path, c.A_DISCR_DIR))
                    torch.save(discr_b.state_dict(), join_path(params.checkpoint_path, c.B_DISCR_DIR))

                print(header)

            if i % params.test_step == 0 and i > 0:
                ensure_dir_exists(params.debug_path)
                with Timer('Creating debug images'):
                    a, b, b_fake, a_fake = create_debug_images(a, b, gen_a_to_b, gen_b_to_a, params.use_cuda)

                    a_filepath = join_path(params.debug_path, '{}-a.jpg'.format(i))
                    skimage.io.imsave(a_filepath, a)

                    b_filepath = join_path(params.debug_path, '{}-b.jpg'.format(i))
                    skimage.io.imsave(b_filepath, b)

                    b_fake_filepath = join_path(params.debug_path, '{}-a-to-b.jpg'.format(i))
                    skimage.io.imsave(b_fake_filepath, b_fake)

                    a_fake_filepath = join_path(params.debug_path, '{}-b-to-a.jpg'.format(i))
                    skimage.io.imsave(a_fake_filepath, a_fake)
Example #19
def main(unused_argv):
    """Run the reinforcement learning loop."""
    utils.ensure_dir_exists(fsdb.models_dir())
    utils.ensure_dir_exists(fsdb.selfplay_dir())
    utils.ensure_dir_exists(fsdb.holdout_dir())
    utils.ensure_dir_exists(fsdb.sgf_dir())
    utils.ensure_dir_exists(fsdb.eval_dir())
    utils.ensure_dir_exists(fsdb.golden_chunk_dir())
    utils.ensure_dir_exists(fsdb.working_dir())

    bootstrap_name = shipname.generate(0)
    bootstrap_model_path = os.path.join(fsdb.models_dir(), bootstrap_name)
    mask_flags.checked_run([
        'python3', 'bootstrap.py',
        '--export_path={}'.format(bootstrap_model_path),
        '--work_dir={}'.format(fsdb.working_dir()),
        '--flagfile=rl_loop/local_flags'
    ])

    selfplay_cmd = [
        'python3', 'selfplay.py',
        '--load_file={}'.format(bootstrap_model_path),
        '--selfplay_dir={}'.format(
            os.path.join(fsdb.selfplay_dir(),
                         bootstrap_name)), '--holdout_dir={}'.format(
                             os.path.join(fsdb.holdout_dir(), bootstrap_name)),
        '--sgf_dir={}'.format(fsdb.sgf_dir()), '--holdout_pct=0',
        '--flagfile=rl_loop/local_flags'
    ]

    # Selfplay twice
    mask_flags.checked_run(selfplay_cmd)
    mask_flags.checked_run(selfplay_cmd)
    # and once more to generate a held-out game for validation.
    # This exploits flags behavior: if a flag is passed twice, the second one wins.
    mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

    # Double check that at least one sgf has been generated.
    assert os.listdir(os.path.join(fsdb.sgf_dir(), 'full'))

    print("Making shuffled golden chunk from selfplay data...")
    # TODO(amj): refactor example_buffer so it can be called the same way
    # as everything else.
    eb.make_chunk_for(output_dir=fsdb.golden_chunk_dir(),
                      local_dir=fsdb.working_dir(),
                      game_dir=fsdb.selfplay_dir(),
                      model_num=1,
                      positions=64,
                      threads=8,
                      sampling_frac=1)

    tf_records = sorted(
        gfile.Glob(os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')))

    trained_model_name = shipname.generate(1)
    trained_model_path = os.path.join(fsdb.models_dir(), trained_model_name)

    # Train on shuffled game data
    mask_flags.checked_run([
        'python3', 'train.py', *tf_records,
        '--work_dir={}'.format(fsdb.working_dir()),
        '--export_path={}'.format(trained_model_path),
        '--flagfile=rl_loop/local_flags'
    ])

    # Validate the trained model on held out game
    mask_flags.checked_run([
        'python3', 'validate.py',
        os.path.join(fsdb.holdout_dir(), bootstrap_name),
        '--work_dir={}'.format(fsdb.working_dir()),
        '--flagfile=rl_loop/local_flags'
    ])

    # Verify that the trained model works for selfplay.
    # This exploits flags behavior: if a flag is passed twice, the second one wins.
    mask_flags.checked_run(selfplay_cmd +
                           ['--load_file={}'.format(trained_model_path)])

    mask_flags.checked_run([
        'python3', 'evaluate.py', bootstrap_model_path, trained_model_path,
        '--games=1', '--eval_sgf_dir={}'.format(fsdb.eval_dir()),
        '--flagfile=rl_loop/local_flags'
    ])
    print("Completed integration test!")
Example #20
def run_game(network, args, device=None, sgf_dir=None, holdout_pct=0.05):
    '''Takes a played game and records results and game data.'''
    selfplay_dir = os.path.join(args.selfplay_dir, args.model_name)
    utils.ensure_dir_exists(selfplay_dir)
    holdout_dir = os.path.join(args.holdout_dir, args.model_name)
    utils.ensure_dir_exists(holdout_dir)
    if args.sgf_dir:
        sgf_dir = os.path.join(args.sgf_dir, args.model_name)
        utils.ensure_dir_exists(sgf_dir)
    if sgf_dir is not None:
        minimal_sgf_dir = os.path.join(sgf_dir, 'clean')
        full_sgf_dir = os.path.join(sgf_dir, 'full')
        utils.ensure_dir_exists(minimal_sgf_dir)
        utils.ensure_dir_exists(full_sgf_dir)
    if selfplay_dir is not None:
        utils.ensure_dir_exists(selfplay_dir)
        utils.ensure_dir_exists(holdout_dir)

    with utils.logged_timer("Playing game"):
        player = play(network, args, device=device)

    features, pis, values = player.extract_data(return_features=True)
    features = np.array(features)
    pis = np.array(pis)
    values = np.array(values)
    assert features.shape[0] == pis.shape[0] == values.shape[0]
    output_name = '{}-{}'.format(int(time.time()), features.shape[0])
    if sgf_dir is not None:
        with open(os.path.join(minimal_sgf_dir, '{}.sgf'.format(output_name)),
                  'w') as f:
            f.write(player.to_sgf(use_comments=False))
        with open(os.path.join(full_sgf_dir, '{}.sgf'.format(output_name)),
                  'w') as f:
            f.write(player.to_sgf())

    if selfplay_dir is not None:
        # Hold out 5% of games for validation.
        if random.random() < holdout_pct:
            fname = os.path.join(holdout_dir, "{}.hdf5".format(output_name))
        else:
            fname = os.path.join(selfplay_dir, "{}.hdf5".format(output_name))

        preprocessing.save_h5_examples(fname, features, pis, values)
Example #21
def main(unused_argv):
  """Run the reinforcement learning loop."""

  print('Wiping dir %s' % FLAGS.base_dir, flush=True)
  shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

  utils.ensure_dir_exists(fsdb.models_dir())
  utils.ensure_dir_exists(fsdb.selfplay_dir())
  utils.ensure_dir_exists(fsdb.holdout_dir())
  utils.ensure_dir_exists(fsdb.eval_dir())
  utils.ensure_dir_exists(fsdb.golden_chunk_dir())
  utils.ensure_dir_exists(fsdb.working_dir())

  # Copy the target model to the models directory so we can find it easily.
  shutil.copy('ml_perf/target.pb', fsdb.models_dir())

  logging.getLogger().addHandler(
      logging.FileHandler(os.path.join(FLAGS.base_dir, 'reinforcement.log')))
  formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                '%Y-%m-%d %H:%M:%S')
  for handler in logging.getLogger().handlers:
    handler.setFormatter(formatter)

  with utils.logged_timer('Total time'):
    rl_loop()
Example #22
                               (i + 1, tst_nsub_acc)))
        print()

    for valid_theta_err in args.theta_values:
        args.err_threshold = valid_theta_err

        # Prepare logger
        log_dir = os.path.join(
            args.log_path,
            str(valid_theta_err) + "_mnist17split" + "_" + args.optim_type +
            "_" + str(args.model_arch) + "_" + str(args.n_copies) + "_" +
            str(args.optim_steps) + "_" + str(args.optim_trials) + "_signed=" +
            str(args.signed) + "_dynamic_n=" + str(args.dynamic_repeat) +
            "_batch_estimate=" + str(args.batch_sample_estimate) + "_" +
            str(args.optim_lr) + "_" + str(args.seed))
        utils.ensure_dir_exists(log_dir)
        logger = SummaryWriter(log_dir=log_dir, flush_secs=10)

        print(
            utils.pink_print("Running attack for theta %.2f" %
                             valid_theta_err))

        # Get poison data
        poison_data, theta_t = modelTargetPoisoningEnsemble(
            thetas_p, logger, args)
        dp_x = ch.cat(poison_data[0], 0).numpy()
        dp_y = ch.cat(poison_data[1], 0).numpy()

        # Save this data
        poisoned_data_dir = os.path.join(os.path.join(log_dir, "poisondata"))
        np.savez(poisoned_data_dir, x=dp_x, y=dp_y)
Example #23
sh.setLevel(logging.INFO)
logger.addHandler(sh)

# Load default values from config file
nlp_config = json.loads(open('nlp-config.json').read())[socket.gethostname()]
workers = nlp_config['workers']

# Allow user to configure options
parser = OptionParser()
parser.add_option("-n", "--workers", dest="workers", action="store", default=workers, help="Specify the number of worker processes to open")
(options, args) = parser.parse_args()

WORKERS = int(options.workers)

# Directory variables for query_write are set here; set vars for query_tar there
EVENT_DIR = ensure_dir_exists('/data/events/')
TEMP_EVENT_DIR = ensure_dir_exists('/data/temp_events/')
TEXT_DIR = ensure_dir_exists('/data/text/')
TEMP_TEXT_DIR = ensure_dir_exists('/data/temp_text/')

def write_text(event_file):
    """Takes event file as input, writes text from all queries contained in
    event file to TEXT_DIR, and returns a list of documents written"""
    for line in open(event_file):
        query = line.strip()
        logger.info('Writing query from %s: "%s"' % (current_process(), query))
        qi = QueryIterator('http://search-s11.prod.wikia.net:8983/solr/main/', {'query': query, 'fields': 'id,wid,html_en,indexed', 'sort': 'id asc'})
        for doc in qi:
            # Sanitize and write text
            text = '\n'.join(clean_list(doc.get('html_en', '')))
            localpath = os.path.join(TEXT_DIR, doc['id'])
Example #24
logger.addHandler(fh)
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
logger.addHandler(sh)

# Allow user to configure options
parser = OptionParser()
parser.add_option('-b', '--batchsize', dest='batchsize', action='store', default=500, help='Specify the maximum number of files in a .tgz batch')
parser.add_option('-l', '--local', dest='local', action='store_true', default=False, help='Specify whether to store text files locally instead of on S3')
(options, args) = parser.parse_args()

BATCHSIZE = options.batchsize
LOCAL = options.local

# Directory variables for query_tar are set here; set vars for query_write there
TEXT_DIR = ensure_dir_exists('/data/text/')
TEMP_TEXT_DIR = ensure_dir_exists('/data/temp_text/')

if not LOCAL:
    bucket = S3Connection().get_bucket('nlp-data')

if __name__ == '__main__':

    # Set to run indefinitely
    while True:

        try:
            bypass_minimum = False
            # Attempt to enforce minimum batch size, continue after 30 seconds if not
            logger.debug('Checking # of files in text directory...')
            num_text_files = len(os.listdir(TEXT_DIR))
Example #25
def save_data(df, outputDir, outputFile):
    ensure_dir_exists(outputDir)
    filepath = outputDir + "/" + outputFile
    print("Saving data to " + filepath)
    df.to_csv(filepath)
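The same helper written with os.path.join instead of manual "/" concatenation, shown only as an alternative sketch (it still assumes ensure_dir_exists is in scope; behavior is otherwise unchanged):

import os

def save_data(df, output_dir, output_file):
    ensure_dir_exists(output_dir)
    filepath = os.path.join(output_dir, output_file)
    print("Saving data to " + filepath)
    df.to_csv(filepath)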
Example #26
def main(unused_argv):

    for i in range(0, NUM_LOOP):
        if i == 0:
            src_model_name = shipname.generate(0)
            fsdb.switch_base(os.path.join(base_dir, src_model_name))
            src_model_path = os.path.join(fsdb.models_dir(), src_model_name)
            bootstrap_model_path = os.path.join(fsdb.models_dir(),
                                                src_model_name)
            mask_flags.checked_run([
                'python3', 'bootstrap.py',
                '--export_path={}'.format(bootstrap_model_path),
                '--work_dir={}'.format(fsdb.working_dir()),
                '--flagfile=rl_loop/local_flags'
            ])
            dst_model_name = shipname.generate(1)
            fsdb.switch_base(os.path.join(base_dir, dst_model_name))
        else:
            src_model_name = dst_model_name
            src_model_path = os.path.join(fsdb.models_dir(), src_model_name)
            dst_model_name = shipname.generate(i + 1)
            fsdb.switch_base(os.path.join(base_dir, dst_model_name))

        utils.ensure_dir_exists(fsdb.models_dir())
        utils.ensure_dir_exists(fsdb.selfplay_dir())
        utils.ensure_dir_exists(fsdb.holdout_dir())
        utils.ensure_dir_exists(fsdb.sgf_dir())
        utils.ensure_dir_exists(fsdb.eval_dir())
        utils.ensure_dir_exists(fsdb.golden_chunk_dir())
        utils.ensure_dir_exists(fsdb.working_dir())

        #bootstrap_name = shipname.generate(0)
        #bootstrap_model_path = os.path.join(fsdb.models_dir(), bootstrap_name)

        print(src_model_name)
        print(src_model_path)
        selfplay_cmd = [
            'python3', 'selfplay.py', '--load_file={}'.format(src_model_path),
            '--selfplay_dir={}'.format(
                os.path.join(fsdb.selfplay_dir(),
                             dst_model_name)), '--holdout_dir={}'.format(
                                 os.path.join(fsdb.holdout_dir(),
                                              dst_model_name)),
            '--sgf_dir={}'.format(fsdb.sgf_dir()), '--holdout_pct=0',
            '--flagfile=rl_loop/local_flags'
        ]

        # Selfplay twice
        mask_flags.checked_run(selfplay_cmd)
        mask_flags.checked_run(selfplay_cmd)

        # and once more to generate a held-out game for validation.
        # This exploits flags behavior: if a flag is passed twice, the second one wins.
        mask_flags.checked_run(selfplay_cmd + ['--holdout_pct=100'])

        # Double check that at least one sgf has been generated.
        assert os.listdir(os.path.join(fsdb.sgf_dir(), 'full'))

        print("Making shuffled golden chunk from selfplay data...")
        # TODO(amj): refactor example_buffer so it can be called the same way
        # as everything else.
        eb.make_chunk_for(output_dir=fsdb.golden_chunk_dir(),
                          local_dir=fsdb.working_dir(),
                          game_dir=fsdb.selfplay_dir(),
                          model_num=1,
                          positions=64,
                          threads=8,
                          sampling_frac=1)

        tf_records = sorted(
            gfile.Glob(os.path.join(fsdb.golden_chunk_dir(), '*.tfrecord.zz')))

        #trained_model_name = shipname.generate(1)
        trained_model_name = dst_model_name
        trained_model_path = os.path.join(fsdb.models_dir(),
                                          trained_model_name)

        # Train on shuffled game data
        mask_flags.checked_run([
            'python3', 'train.py', *tf_records,
            '--work_dir={}'.format(fsdb.working_dir()),
            '--export_path={}'.format(trained_model_path),
            '--flagfile=rl_loop/local_flags'
        ])

    print("Finished!")
Example #27
def main(argv):
    """Play matches between two neural nets."""
    _, black_model, white_model = argv
    utils.ensure_dir_exists(FLAGS.eval_sgf_dir)
    play_match(black_model, white_model, FLAGS.num_evaluation_games, FLAGS.eval_sgf_dir)
Example #28
def run_one_shot():
    '''Runs the creation of the index and stores it to disk.'''
    utils.ensure_dir_exists('output')
    utils.ensure_dir_exists('data')
    inverted_index: Dict[str, Tuple[int, set]] = from_scratch_index_creation()
    save_index_to_disk(inverted_index, outfile=init_params().output_file)
Example #29
def main(unused_argv):
    """Bootstrap random weights."""
    utils.ensure_dir_exists(os.path.dirname(FLAGS.export_path))
    if FLAGS.create_bootstrap:
        dual_net.bootstrap()
    dual_net.export_model(FLAGS.export_path)
Example #30
def swa():
    path_base = fsdb.models_dir()
    model_names = [
        "000393-lincoln",
        "000390-indus",
        "000404-hannibal",
        "000447-hawke",
        "000426-grief",
        "000431-lion",
        "000428-invincible",
        "000303-olympus",
        "000291-superb",
        "000454-victorious",
    ]
    model_names = model_names[:FLAGS.count]

    model_paths = [os.path.join(path_base, m) for m in model_names]

    # construct the graph
    features, labels = dual_net.get_inference_input()
    dual_net.model_fn(features, labels, tf.estimator.ModeKeys.PREDICT, FLAGS.flag_values_dict())

    # restore all saved weights
    meta_graph_def = meta_graph.read_meta_graph_file(model_paths[0] + '.meta')
    stored_var_names = set(
        [n.name for n in meta_graph_def.graph_def.node if n.op == 'VariableV2'])

    var_list = [v for v in tf.global_variables()
                if v.op.name in stored_var_names]
    var_list.sort(key=lambda v: v.op.name)

    print(stored_var_names)
    print(len(stored_var_names), len(var_list))

    sessions = [tf.Session() for _ in model_paths]
    saver = tf.train.Saver()
    for sess, model_path in zip(sessions, model_paths):
        saver.restore(sess, model_path)

    # Load all VariableV2s for each model.
    values = [sess.run(var_list) for sess in sessions]

    # Iterate over all variables, averaging values from all models.
    all_assign = []
    for var, vals in zip(var_list, zip(*values)):
        print("{}x {}".format(len(vals), var))
        if var.name == "global_step:0":
            avg = vals[0]
            for val in vals:
                avg = tf.maximum(avg, val)
        else:
            avg = tf.add_n(vals) / len(vals)

        all_assign.append(tf.assign(var, avg))

    # Run all assign ops on an existing model (which has other ops and graph).
    sess = sessions[0]
    sess.run(all_assign)

    # Export a new saved model.
    ensure_dir_exists(FLAGS.data_dir)
    dest_path = os.path.join(FLAGS.data_dir, "swa-" + str(FLAGS.count))
    saver.save(sess, dest_path)
Example #31
    def test(self):
        utils.ensure_dir_exists(self.base_path)
        return True
Example #32
def run(sess, f, data):
    # create graph
    input_size, output_size = data.io_shape
    inputs = tf.placeholder(tf.float32, [None, input_size], name='inputs')
    outputs, _, feed_dicts = m.get(f.model).create_model(inputs, output_size)

    labels = tf.placeholder(tf.float32, [None, output_size], name='labels')
    loss, train_step = create_train_ops(outputs, labels, lr=f.lr, loss=f.loss)
    accuracy = create_eval_ops(outputs, labels, loss=f.loss)
    summary_op = create_summary_ops(loss, accuracy)

    # only initialize non-initialized vars:
    u.init_uninitted_vars(sess)
    # (this is not super important for training, but it's very important
    # in optimize, and in distill)

    saver = tf.train.Saver(tf.global_variables())

    summary_dir = os.path.join(f.summary_folder, f.run_name, 'train')
    train_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'train'),
                                         sess.graph)
    trainbatch_writer = tf.summary.FileWriter(
        os.path.join(summary_dir, 'train_batch'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(summary_dir, 'test'),
                                        sess.graph)

    with sess.as_default():
        global_step = 0

        for i in range(f.epochs):
            print('Epoch: {}'.format(i))
            for batch_x, batch_y in data.train_epoch_in_batches(
                    f.train_batch_size):
                summary, _ = sess.run([summary_op, train_step],
                                      feed_dict={
                                          **feed_dicts['train'], inputs:
                                          batch_x,
                                          labels: batch_y
                                      })
                trainbatch_writer.add_summary(summary, global_step)

                if global_step % f.eval_interval == 0:
                    # eval test set
                    summaries = []
                    for test_batch_x, test_batch_y in data.test_epoch_in_batches(
                            f.test_batch_size):
                        summary = sess.run(summary_op,
                                           feed_dict={
                                               **feed_dicts['eval'], inputs:
                                               test_batch_x,
                                               labels: test_batch_y
                                           })
                        summaries.append(summary)
                    test_writer.add_summary(
                        u.merge_summary_list(summaries, True), global_step)

                    # eval train set
                    summaries = []
                    for train_batch_x, train_batch_y in data.train_epoch_in_batches(
                            f.train_batch_size):
                        summary = sess.run(summary_op,
                                           feed_dict={
                                               **feed_dicts['eval'], inputs:
                                               train_batch_x,
                                               labels: train_batch_y
                                           })
                        summaries.append(summary)
                    train_writer.add_summary(
                        u.merge_summary_list(summaries, True), global_step)

                global_step += 1

                if global_step % f.checkpoint_interval == 0:
                    checkpoint_dir = os.path.join(summary_dir, 'checkpoint/')
                    u.ensure_dir_exists(checkpoint_dir)
                    checkpoint_file = os.path.join(checkpoint_dir, f.model)
                    saved_file = saver.save(sess,
                                            checkpoint_file,
                                            global_step=global_step)
                    print('saved model at {}'.format(saved_file))

    print('saved model at {}'.format(saved_file))
Example #33
def main(unused_argv):
    """Run the reinforcement learning loop."""

    print('Wiping dir %s' % FLAGS.base_dir, flush=True)
    shutil.rmtree(FLAGS.base_dir, ignore_errors=True)

    utils.ensure_dir_exists(fsdb.models_dir())
    utils.ensure_dir_exists(fsdb.selfplay_dir())
    utils.ensure_dir_exists(fsdb.holdout_dir())
    utils.ensure_dir_exists(fsdb.eval_dir())
    utils.ensure_dir_exists(fsdb.golden_chunk_dir())
    utils.ensure_dir_exists(fsdb.working_dir())

    # Copy the flag files so there's no chance of them getting accidentally
    # overwritten while the RL loop is running.
    flags_dir = os.path.join(FLAGS.base_dir, 'flags')
    shutil.copytree(FLAGS.flags_dir, flags_dir)
    FLAGS.flags_dir = flags_dir

    # Copy the target model to the models directory so we can find it easily.
    shutil.copy('ml_perf/target.pb', fsdb.models_dir())

    logging.getLogger().addHandler(
        logging.FileHandler(os.path.join(FLAGS.base_dir, 'rl_loop.log')))
    formatter = logging.Formatter('[%(asctime)s] %(message)s',
                                  '%Y-%m-%d %H:%M:%S')
    for handler in logging.getLogger().handlers:
        handler.setFormatter(formatter)

    with utils.logged_timer('Total time'):
        try:
            rl_loop()
        finally:
            asyncio.get_event_loop().close()
Example #34
            # skip until the first batch has completed
            if tar:
                tar.close()
                # remove text files after tarring
                shutil.rmtree(dest_dir)
                # send to aws and remove tarball
                if aws:
                    k.key = 'text_events/%s' % os.path.basename(tar_file)
                    k.set_contents_from_filename(tar_file)
                    os.remove(tar_file)
                    # send post requests for each wid covered in this batch
                    for wid in wids:
                        requests.post('http://nlp-s1:5000/wiki/%i' % wid)
                wids = []
            batch_count += 1
            dest_dir = ensure_dir_exists(TEXT_DIR + '%s_%i' % (os.path.basename(qqfile), batch_count))
            # open tarball for writing
            tar_file = dest_dir + '.tgz'
            tar = tarfile.open(tar_file, 'w:gz')
        wid = int(doc['wid'])
        if wid not in wids:
            wids.append(wid)
        # sanitize and write text
        text = '\n'.join(clean_list(doc.get('html_en', '')))
        localpath = os.path.join(dest_dir, doc['id'])
        with open(localpath, 'w') as f:
            f.write(text)
        # add text file to tarball
        tar.add(localpath, doc['id'])
        doc_count += 1
# tar the final batch and send to aws
Example #35
def get_operators(opts,
                  verts,
                  faces,
                  k_eig,
                  normals=None,
                  overwrite_cache=False,
                  truncate_cache=False):
    """
    See documentation for compute_operators(). This essentially just wraps a
    call to compute_operators, using a cache if possible.
    All arrays are always computed using double precision for stability, then
    truncated to single-precision floats for storage on disk, and finally
    returned as tensors with dtype/device matching the `verts` input.
    """

    device = verts.device
    dtype = verts.dtype
    verts_np = toNP(verts)
    faces_np = toNP(faces)
    is_cloud = faces.numel() == 0

    if (np.isnan(verts_np).any()):
        raise RuntimeError("tried to construct operators from NaN verts")

    # Check the cache directory
    # Note 1: Collisions here are exceptionally unlikely, so we could probably just use the hash...
    #         but for good measure we check values nonetheless.
    # Note 2: There is a small possibility for race conditions to lead to bucket gaps or duplicate
    #         entries in this cache. The good news is that that is totally fine, and at most slightly
    #         slows performance with rare extra cache misses.
    found = False
    if opts.eigensystem_cache_dir is not None:
        utils.ensure_dir_exists(opts.eigensystem_cache_dir)
        hash_key_str = str(utils.hash_arrays((verts_np, faces_np)))
        # print("Building operators for input with hash: " + hash_key_str)

        # Search through buckets with matching hashes.  When the loop exits, this
        # is the bucket index of the file we should write to.
        i_cache_search = 0
        while True:

            # Form the name of the file to check
            search_path = os.path.join(
                opts.eigensystem_cache_dir,
                hash_key_str + "_" + str(i_cache_search) + ".npz")

            try:
                # print('loading path: ' + str(search_path))
                npzfile = np.load(search_path, allow_pickle=True)
                cache_verts = npzfile["verts"]
                cache_faces = npzfile["faces"]
                cache_k_eig = npzfile["k_eig"].item()

                # If the cache doesn't match, keep looking
                if (not np.array_equal(verts, cache_verts)) or (
                        not np.array_equal(faces, cache_faces)):
                    i_cache_search += 1
                    print("hash collision! searching next.")
                    continue

                # If we're overwriting, or there aren't enough eigenvalues,
                # just delete it; we'll create a new entry below with more
                # eigenvalues.
                if overwrite_cache or cache_k_eig < k_eig:
                    print(
                        "  overwiting / not enough eigenvalues --- recomputing"
                    )
                    os.remove(search_path)
                    break

                # This entry matches! Return it.
                found = True
                frames = npzfile["frames"]
                mass = npzfile["mass"]
                evals = npzfile["evals"][:k_eig]
                evecs = npzfile["evecs"][:, :k_eig]
                grad_from_spectral = npzfile[
                    "grad_from_spectral"][:, :k_eig, :]

                if truncate_cache and cache_k_eig > k_eig:
                    print("TRUNCATING CACHE {} --> {}".format(
                        cache_k_eig, k_eig))
                    np.savez(
                        search_path,
                        verts=verts_np,
                        frames=frames,
                        faces=faces_np,
                        k_eig=k_eig,
                        mass=mass,
                        evals=evals,
                        evecs=evecs,
                        grad_from_spectral=grad_from_spectral,
                    )

                frames = torch.from_numpy(frames).to(device=device,
                                                     dtype=dtype)
                mass = torch.from_numpy(mass).to(device=device, dtype=dtype)
                evals = torch.from_numpy(evals).to(device=device, dtype=dtype)
                evecs = torch.from_numpy(evecs).to(device=device, dtype=dtype)
                grad_from_spectral = torch.from_numpy(grad_from_spectral).to(
                    device=device, dtype=dtype)

                break

            except FileNotFoundError:
                print("  cache miss -- constructing operators")
                break

            except Exception as E:
                print("unexpected error loading file: " + str(E))
                print("-- constructing operators")
                break

    if not found:

        # No matching entry found; recompute.
        frames, mass, evals, evecs, grad_from_spectral = compute_operators(
            verts, faces, k_eig, normals=normals)

        dtype_np = np.float32

        # Store it in the cache
        if opts.eigensystem_cache_dir is not None:
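            # `search_path` still names the bucket chosen by the search loop
            # above (normally a missing file, or an entry that was just
            # removed), so the recomputed operators are written into that slot.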
            np.savez(
                search_path,
                verts=verts_np,
                frames=toNP(frames).astype(dtype_np),
                faces=faces_np,
                k_eig=k_eig,
                mass=toNP(mass).astype(dtype_np),
                evals=toNP(evals).astype(dtype_np),
                evecs=toNP(evecs).astype(dtype_np),
                grad_from_spectral=toNP(grad_from_spectral).astype(dtype_np),
            )

    return frames, mass, evals, evecs, grad_from_spectral