def main():
    #
    # Disable requests insecure warnings
    #
    disable_insecure_warnings()

    #
    # Parse arguments
    #
    parser = build_cli_parser("New Binaries with Netconns")
    parser.add_argument("-d", "--date-to-query", action="store", dest="date",
                        help="New since DATE, format YYYY-MM-DD")
    parser.add_argument("-f", "--output-file", action="store", dest="output_file",
                        help="output file in csv format")
    opts = parser.parse_args()
    if not opts.date:
        parser.print_usage()
        sys.exit(-1)

    #
    # Initialize the cbapi-ng
    # TODO get_cb_object
    #
    cb = get_cb_response_object(opts)

    #
    # Main query
    #
    start_date = "[" + opts.date + "T00:00:00 TO *]"
    binary_query = cb.select(Binary).where(("host_count:[1 TO 3]"
                                            " server_added_timestamp:" + start_date +
                                            " -observed_filename:*.dll"
                                            " -digsig_publisher:Microsoft*"
                                            " -alliance_score_srstrust:*"))

    #
    # Set up the csv writer (text mode with newline='' so csv handles
    # line endings correctly on Python 3)
    #
    if not opts.output_file:
        output_file = open("new_binaries_with_netconns.csv", 'w', newline='')
    else:
        output_file = open(opts.output_file, 'w', newline='')
    csv_writer = csv.writer(output_file)

    #
    # Write out the CSV header
    #
    csv_writer.writerow(("FileName", "Hostname", "Username", "Network Connections",
                         "Process Link", "Binary Link", "Binary MD5",
                         "Signature Status", "Company", "Observed Date",
                         "Host Count", "Binary TimeStamp"))

    #
    # Create the progress bar
    #
    pbar = ProgressBar(widgets=[Percentage(), Bar()],
                       maxval=len(binary_query)).start()

    for i, binary in enumerate(binary_query):
        #
        # Update progress bar
        #
        pbar.update(i + 1)

        #
        # Retrieve the binary timestamp from the PE header
        #
        binary_timestamp = time.asctime(
            time.gmtime(pefile.PE(data=binary.file.read()).FILE_HEADER.TimeDateStamp))

        #
        # Build a sub-query to see if this binary was executed and had netconns
        #
        sub_query = "process_md5:" + binary.md5 + " netconn_count:[1 TO *]"
        process_query = cb.select(Process).where(sub_query)

        #
        # Iterate through the results and write each row out
        #
        for process in process_query:
            try:
                csv_writer.writerow((process.path,
                                     process.hostname,
                                     process.username,
                                     process.netconn_count,
                                     process.webui_link,
                                     binary.webui_link,
                                     binary.md5,
                                     binary.signing_data.result if binary.signing_data.result else "UNSIGNED",
                                     binary.company_name,
                                     binary.server_added_timestamp,
                                     binary.host_count,
                                     binary_timestamp))
            except Exception:
                print(binary)

    pbar.finish()
def perform_evaluation(trees, features, samples, n_iterations, n_classes,
                       testing_algorithms):
    if n_classes > 2 and ("opencv_plain" in testing_algorithms or
                          "opencv_adaboost" in testing_algorithms):
        raise Exception("OpenCV does not support more than two classes")

    # The fertilized variants differ only in the boosting strategy and in
    # whether a leaf manager is used, so they share one code path below.
    fertilized_variants = {
        "fertilized_plain": ("PLAIN", False),
        "fertilized_samme": ("SAMME", False),
        "fertilized_samme_r": ("SAMME.R", False),
        "fertilized_adaboost": ("ADABOOST", False),
        "fertilized_samme_r_leafman": ("SAMME.R", True),
    }

    permutations = list(product(*[trees, features, samples]))
    for (n_trees, n_features, n_samples) in permutations:
        steps = len(testing_algorithms) + 1
        times = [[] for _ in testing_algorithms]
        scores = [[] for _ in testing_algorithms]
        f1_average = "macro" if n_classes > 2 else "binary"

        pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()],
                           maxval=steps * n_iterations).start()
        print("Current parameters: n_trees=" + str(n_trees) +
              " n_samples=" + str(n_samples) +
              " n_features=" + str(n_features))

        for index in range(n_iterations):
            current_step = 0

            # Generate fresh train and test samples for this iteration
            trainX, trainY = create_samples(n_samples, n_features, n_classes)
            testX, testY = create_samples(n_samples, n_features, n_classes)
            pbar.update(index * steps + current_step)
            current_step += 1

            for algorithm in testing_algorithms:
                if algorithm in fertilized_variants:
                    boosting, leafman = fertilized_variants[algorithm]
                    fertilizedForest = setup_fertilized(
                        n_trees, n_features, n_classes, 1, 2, boosting, leafman)
                    fertilizedTrainY = np.ascontiguousarray(
                        np.atleast_2d(trainY).T)
                    start = clock()
                    fertilizedForest.fit(trainX, fertilizedTrainY)
                    end = clock()
                    times[current_step - 1].append(end - start)
                    prediction = fertilizedForest.predict(testX)
                    scores[current_step - 1].append(
                        f1_score(testY, [np.argmax(x) for x in prediction],
                                 average=f1_average))
                elif algorithm == "sklearn_plain":
                    sklearnForest = setup_sklearn_randomForest(
                        n_trees, n_features, 1, 2)
                    start = clock()
                    sklearnForest.fit(trainX, trainY)
                    end = clock()
                    times[current_step - 1].append(end - start)
                    prediction = sklearnForest.predict(testX)
                    scores[current_step - 1].append(
                        f1_score(testY, prediction, average=f1_average))
                elif algorithm in ("sklearn_samme", "sklearn_samme_r"):
                    variant = "SAMME" if algorithm == "sklearn_samme" else "SAMME.R"
                    sklearnBoost = setup_sklearn_adaBoost(
                        n_trees, n_features, 1, 2, variant)
                    start = clock()
                    sklearnBoost.fit(trainX, trainY)
                    end = clock()
                    times[current_step - 1].append(end - start)
                    prediction = sklearnBoost.predict(testX)
                    scores[current_step - 1].append(
                        f1_score(testY, prediction, average=f1_average))
                elif algorithm == "opencv_plain":
                    p = setup_opencv_rtrees(n_trees, n_features, 1, 2)
                    opencvForest = RTrees()
                    start = clock()
                    opencvForest.train(trainX, CV_ROW_SAMPLE,
                                       trainY.astype("int32"), params=p)
                    end = clock()
                    times[current_step - 1].append(end - start)
                    prediction = [int(x)
                                  for x in [opencvForest.predict(x) for x in testX]]
                    scores[current_step - 1].append(
                        f1_score(testY, prediction, average=f1_average))
                elif algorithm == "opencv_adaboost":
                    p = setup_opencv_boost(n_trees, 1)
                    opencvBoost = Boost()
                    start = clock()
                    opencvBoost.train(trainX, CV_ROW_SAMPLE,
                                      trainY.astype("int32"), params=p)
                    end = clock()
                    times[current_step - 1].append(end - start)
                    prediction = [int(x)
                                  for x in [opencvBoost.predict(x) for x in testX]]
                    scores[current_step - 1].append(
                        f1_score(testY, prediction, average=f1_average))

                pbar.update(index * steps + current_step)
                current_step += 1
        pbar.finish()

        output = []
        for idx, alg in enumerate(testing_algorithms):
            output.append([alg, str(np.mean(scores[idx])), str(np.mean(times[idx]))])
        table = tabulate(output,
                         headers=["Algorithm", "F1 Score", "Fit Time in s"])
        print(table)
def main():
    global args
    args = parser.parse_args()

    cuda = args.cuda == 'true'

    task_name = args.task_name
    epoch_size = args.epoch_size
    batch_size = args.batch_size

    result_path = os.path.join(args.result_path, args.task_name, args.model_arch)
    model_path = os.path.join(args.model_path, args.task_name, args.model_arch)

    # 84*11 x 3 x 64 x 64, 85*11 x 3 x 64 x 64, 14*11 x 3 x 64 x 64, 14*11 x 3 x 64 x 64
    data, myidx, test_style_A, test_style_B = get_data()
    test = test_style_A + test_style_B

    if not args.task_name.startswith('car') and not args.task_name.endswith('car'):
        test_A = read_images(filenames=test, domain='A', image_size=args.image_size)
        test_B = read_images(filenames=test, domain='B', image_size=args.image_size)
        test_A = Variable(torch.FloatTensor(test_A), volatile=True)
        test_B = Variable(torch.FloatTensor(test_B), volatile=True)

    if not os.path.exists(result_path):
        os.makedirs(result_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    generator_A = Generator(extra_layers=True)
    generator_B = Generator(extra_layers=True)
    discriminator_ali = ad_Discriminator()
    discriminator_ReconA = ad_Discriminator_fm1()
    discriminator_ReconB = ad_Discriminator_fm1()

    if cuda:
        test_A = test_A.cuda()
        test_B = test_B.cuda()
        generator_A = generator_A.cuda()
        generator_B = generator_B.cuda()
        discriminator_ali = discriminator_ali.cuda()
        discriminator_ReconA = discriminator_ReconA.cuda()
        discriminator_ReconB = discriminator_ReconB.cuda()

    data_size = len(data)
    n_batches = data_size // batch_size

    recon_criterion = nn.MSELoss()
    gan_criterion = nn.BCELoss()
    #feat_criterion = nn.HingeEmbeddingLoss()
    feat_criterion = nn.MSELoss()
    spv_criterion = nn.MSELoss()

    gen_params = chain(generator_A.parameters(), generator_B.parameters())
    dis_params = chain(discriminator_ali.parameters(),
                       discriminator_ReconA.parameters(),
                       discriminator_ReconB.parameters())

    optim_gen = optim.Adam(gen_params, lr=args.learning_rate,
                           betas=(0.5, 0.999), weight_decay=0.00001)
    optim_dis = optim.Adam(dis_params, lr=args.learning_rate,
                           betas=(0.5, 0.999), weight_decay=0.00001)

    iters = 0
    gen_loss_total = []
    dis_loss_total = []

    for epoch in range(epoch_size):
        _idx_A = list(range(len(data)))
        np.random.shuffle(_idx_A)
        _idx_B = list(range(len(data)))
        np.random.shuffle(_idx_B)
        data_A = np.array(data)[np.array(_idx_A)]
        data_B = np.array(data)[np.array(_idx_B)]

        widgets = ['epoch #%d|' % epoch, Percentage(), Bar(), ETA()]
        pbar = ProgressBar(maxval=n_batches, widgets=widgets)
        pbar.start()

        for i in range(n_batches):
            pbar.update(i)

            generator_A.zero_grad()
            generator_B.zero_grad()
            discriminator_ali.zero_grad()
            discriminator_ReconA.zero_grad()
            discriminator_ReconB.zero_grad()

            ############################################################### un_spv
            _path_A = data_A[i * batch_size:(i + 1) * batch_size]
            _path_B = data_B[i * batch_size:(i + 1) * batch_size]

            A = read_images(filenames=_path_A, domain='A', image_size=args.image_size)
            B = read_images(filenames=_path_B, domain='B', image_size=args.image_size)
            A = Variable(torch.FloatTensor(A))
            B = Variable(torch.FloatTensor(B))
            if cuda:
                A = A.cuda()
                B = B.cuda()

            AB = generator_B(A)
            BA = generator_A(B)

            # Use a discriminator to replace the reconstruction loss
            ABA = generator_A(AB)
            BAB = generator_B(BA)

            A_t = torch.cat([A, A], 1).cuda()    # 64 x 9 x 64 x 64
            A_f = torch.cat([A, ABA], 1).cuda()  # 64 x 9 x 64 x 64
            ReconA_dis_real, ReconA_feats_real = discriminator_ReconA(A_t)
            ReconA_dis_fake, ReconA_feats_fake = discriminator_ReconA(A_f)
            dis_loss_ReconA, gen_loss_ReconA = get_gan_loss(
                ReconA_dis_real, ReconA_dis_fake, gan_criterion, cuda)
            fm_loss_ReconA = get_fm_loss(ReconA_feats_real, ReconA_feats_fake,
                                         feat_criterion)

            B_t = torch.cat([B, B], 1).cuda()    # 64 x 9 x 64 x 64
            B_f = torch.cat([B, BAB], 1).cuda()  # 64 x 9 x 64 x 64
            ReconB_dis_real, ReconB_feats_real = discriminator_ReconB(B_t)
            ReconB_dis_fake, ReconB_feats_fake = discriminator_ReconB(B_f)
            dis_loss_ReconB, gen_loss_ReconB = get_gan_loss(
                ReconB_dis_real, ReconB_dis_fake, gan_criterion, cuda)
            fm_loss_ReconB = get_fm_loss(ReconB_feats_real, ReconB_feats_fake,
                                         feat_criterion)

            # Real/Fake GAN loss (A)
            tuple_1 = torch.cat([A, AB], 1).cuda()  # 64 x 6 x 64 x 64
            tuple_2 = torch.cat([BA, B], 1).cuda()  # 64 x 6 x 64 x 64
            dis_real, feats_real = discriminator_ali(tuple_1)
            dis_fake, feats_fake = discriminator_ali(tuple_2)
            dis_loss, gen_loss = get_ali_loss(dis_real, dis_fake,
                                              gan_criterion, cuda)
            fm_loss = get_fm_loss(feats_real, feats_fake, feat_criterion)

            if iters < args.gan_curriculum:
                rate = args.starting_rate
            else:
                rate = args.default_rate

            gen_loss_A_total = ((fm_loss * 0.9 + gen_loss * 0.1) * (1. - rate)) / 2.0
            gen_loss_B_total = ((fm_loss * 0.9 + gen_loss * 0.1) * (1. - rate)) / 2.0
            gen_loss_ReconA_total = (fm_loss_ReconB * 0.9 +
                                     gen_loss_ReconB * 0.1) * (1. - rate) / 2.0
            gen_loss_ReconB_total = (fm_loss_ReconA * 0.9 +
                                     gen_loss_ReconA * 0.1) * (1. - rate) / 2.0
            #############################################################################

            if args.model_arch == 'adrec_edges2shoes_ali_nospv_fm':
                gen_loss = (gen_loss_A_total + gen_loss_B_total +
                            gen_loss_ReconA_total + gen_loss_ReconB_total)
                dis_loss = dis_loss + dis_loss_ReconA + dis_loss_ReconB
            elif args.model_arch == 'gan':
                gen_loss = gen_loss_B
                dis_loss = dis_loss_B

            if iters % args.update_interval == 0:
                dis_loss.backward()
                optim_dis.step()
            else:
                gen_loss.backward()
                optim_gen.step()

            if iters % args.log_interval == 0:
                print("---------------------")
                print("GEN Loss:", as_np(gen_loss.mean()),
                      as_np(gen_loss_ReconA.mean()),
                      as_np(gen_loss_ReconB.mean()))
                print("DIS Loss:", as_np(dis_loss.mean()),
                      as_np(dis_loss_ReconA.mean()),
                      as_np(dis_loss_ReconB.mean()))

            if iters % args.image_save_interval == 0:
                AB = generator_B(test_A)
                BA = generator_A(test_B)
                ABA = generator_A(AB)
                BAB = generator_B(BA)

                n_testset = min(test_A.size()[0], test_B.size()[0])
                subdir_path = os.path.join(
                    result_path, str(iters / args.image_save_interval))
                if not os.path.exists(subdir_path):
                    os.makedirs(subdir_path)

                for im_idx in range(n_testset):
                    A_val = test_A[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.
                    B_val = test_B[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.
                    BA_val = BA[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.
                    ABA_val = ABA[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.
                    AB_val = AB[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.
                    BAB_val = BAB[im_idx].cpu().data.numpy().transpose(1, 2, 0) * 255.

                    filename_prefix = os.path.join(subdir_path, str(im_idx))
                    scipy.misc.imsave(filename_prefix + '.A.jpg',
                                      A_val.astype(np.uint8)[:, :, ::-1])
                    scipy.misc.imsave(filename_prefix + '.B.jpg',
                                      B_val.astype(np.uint8)[:, :, ::-1])
                    scipy.misc.imsave(filename_prefix + '.BA.jpg',
                                      BA_val.astype(np.uint8)[:, :, ::-1])
                    scipy.misc.imsave(filename_prefix + '.AB.jpg',
                                      AB_val.astype(np.uint8)[:, :, ::-1])
                    scipy.misc.imsave(filename_prefix + '.ABA.jpg',
                                      ABA_val.astype(np.uint8)[:, :, ::-1])
                    scipy.misc.imsave(filename_prefix + '.BAB.jpg',
                                      BAB_val.astype(np.uint8)[:, :, ::-1])

            if iters % args.model_save_interval == 0:
                torch.save(generator_A,
                           os.path.join(model_path, 'model_gen_A-' +
                                        str(iters / args.model_save_interval)))
                torch.save(generator_B,
                           os.path.join(model_path, 'model_gen_B-' +
                                        str(iters / args.model_save_interval)))
                torch.save(discriminator_ali,
                           os.path.join(model_path, 'model_dis_ali-' +
                                        str(iters / args.model_save_interval)))
                #torch.save(discriminator_spv_A, os.path.join(model_path, 'model_dis_spv_A-' + str(iters / args.model_save_interval)))
                #torch.save(discriminator_spv_B, os.path.join(model_path, 'model_dis_spv_B-' + str(iters / args.model_save_interval)))

            iters += 1
def experiment_one():
    model_dir = './models/mnf_lenet_mnist_fq2_fr2_usezTrue_thres0.5/model/'
    # pyx = tf.get_variable("pyx")
    # with tf.Session() as sess:
    # sess = tf.InteractiveSession()
    sess = tf.Session()

    mnist = MNIST()
    (xtrain, ytrain), (xvalid, yvalid), (xtest, ytest) = mnist.images()
    xtrain, xvalid, xtest = (np.transpose(xtrain, [0, 2, 3, 1]),
                             np.transpose(xvalid, [0, 2, 3, 1]),
                             np.transpose(xtest, [0, 2, 3, 1]))
    ytrain, yvalid, ytest = (to_categorical(ytrain, 10),
                             to_categorical(yvalid, 10),
                             to_categorical(ytest, 10))

    N, height, width, n_channels = xtrain.shape
    iter_per_epoch = N / 100

    input_shape = [None, height, width, n_channels]
    x = tf.placeholder(tf.float32, input_shape, name='x')
    y_ = tf.placeholder(tf.float32, [None, 10], name='y_')

    model = MNFLeNet(N, input_shape=input_shape, flows_q=2, flows_r=2,
                     use_z=False, learn_p=True, thres_var=0.5, flow_dim_h=50)

    tf.set_random_seed(1)
    np.random.seed(1)

    y = model.predict(x)
    yd = model.predict(x, sample=False)
    pyx = tf.nn.softmax(y)

    saver = tf.train.import_meta_graph(model_dir + 'mnf.meta')
    saver.restore(sess, tf.train.latest_checkpoint(model_dir))
    # saver = tf.train.latest_checkpoint(model_dir + '**mnf**')
    # saver.restore(sess, model_dir + 'mnf.json')
    # saver.restore(sess, model_dir + 'mnf')

    all_vars = tf.get_collection('vars')
    for v in all_vars:
        v_ = sess.run(v)
    print("loaded")

    print '------------------------------------------------'
    print '- MNIST rotated -'
    data_path = '../../data/mnist/mnist_rotated.pkl'
    if os.path.exists(data_path):
        with open(data_path, 'rb') as f:
            data = pickle.load(f)
        X = data['X']
        y = data['y']
    else:
        # X, y = test_mnist_rot(plot=False)
        # save_mnist_to_file(X, y)
        pass

    X = X.reshape((X.shape[0], 1, 28, 28))
    print X.shape
    X = np.transpose(X, [0, 2, 3, 1])
    # X = X[:, np.newaxis, :, :]
    y = to_categorical(y, 10)
    print 'Data loaded'

    preds = np.zeros_like(y)
    widgets = ["Sampling |", Percentage(), Bar(), ETA()]
    pbar = ProgressBar(10, widgets=widgets)
    pbar.start()
    for i in xrange(10):
        pbar.update(i)
        for j in xrange(1):
            pyxi = sess.run(pyx, feed_dict={x: X[0:10]})
            preds[0:10] += pyxi / 10
    print

    sample_accuracy = np.mean(np.equal(np.argmax(preds, 1), np.argmax(y, 1)))
    print 'Sample test accuracy: {}'.format(sample_accuracy)
    print '------------------------------------------------'
def get_progressbar(maxval=10000):
    pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=maxval).start()
    return pbar
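# A minimal usage sketch for get_progressbar (not from the original source):
# the helper returns an already-started bar, so a caller only needs update()
# and finish(). The workload below is purely illustrative.
def _demo_get_progressbar(total=10000):
    pbar = get_progressbar(maxval=total)
    for i in range(total):
        # ... one unit of work would go here ...
        pbar.update(i + 1)
    pbar.finish()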
def train(self):
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        with tf.device("/gpu:%d" % cfg.GPU_ID):
            counter = self.build_model(sess)
            saver = tf.train.Saver(tf.all_variables(),
                                   keep_checkpoint_every_n_hours=2)
            # summary_op = tf.merge_all_summaries()
            summary_writer = tf.train.SummaryWriter(self.log_dir, sess.graph)

            keys = ["d_loss", "g_loss"]
            log_vars = []
            log_keys = []
            for k, v in self.log_vars:
                if k in keys:
                    log_vars.append(v)
                    log_keys.append(k)
                    # print(k, v)

            generator_lr = cfg.TRAIN.GENERATOR_LR
            discriminator_lr = cfg.TRAIN.DISCRIMINATOR_LR
            num_embedding = cfg.TRAIN.NUM_EMBEDDING
            lr_decay_step = cfg.TRAIN.LR_DECAY_EPOCH

            number_example = self.dataset.train._num_examples
            updates_per_epoch = int(number_example / self.batch_size)
            epoch_start = int(counter / updates_per_epoch)
            for epoch in range(epoch_start, self.max_epoch):
                widgets = ["epoch #%d|" % epoch, Percentage(), Bar(), ETA()]
                pbar = ProgressBar(maxval=updates_per_epoch, widgets=widgets)
                pbar.start()

                if epoch % lr_decay_step == 0 and epoch != 0:
                    generator_lr *= 0.5
                    discriminator_lr *= 0.5

                all_log_vals = []
                for i in range(updates_per_epoch):
                    pbar.update(i)
                    # training d
                    images, wrong_images, embeddings, _, _ =\
                        self.dataset.train.next_batch(self.batch_size,
                                                      num_embedding)
                    feed_dict = {self.images: images,
                                 self.wrong_images: wrong_images,
                                 self.embeddings: embeddings,
                                 self.generator_lr: generator_lr,
                                 self.discriminator_lr: discriminator_lr}
                    # train d
                    feed_out = [self.discriminator_trainer,
                                self.d_sum,
                                self.hist_sum,
                                log_vars]
                    _, d_sum, hist_sum, log_vals = sess.run(feed_out, feed_dict)
                    summary_writer.add_summary(d_sum, counter)
                    summary_writer.add_summary(hist_sum, counter)
                    all_log_vals.append(log_vals)
                    # train g
                    feed_out = [self.generator_trainer, self.g_sum]
                    _, g_sum = sess.run(feed_out, feed_dict)
                    summary_writer.add_summary(g_sum, counter)
                    # save checkpoint
                    counter += 1
                    if counter % self.snapshot_interval == 0:
                        snapshot_path = "%s/%s_%s.ckpt" %\
                            (self.checkpoint_dir, self.exp_name, str(counter))
                        fn = saver.save(sess, snapshot_path)
                        print("Model saved in file: %s" % fn)

                img_sum = self.epoch_sum_images(sess, cfg.TRAIN.NUM_COPY, epoch)
                summary_writer.add_summary(img_sum, counter)

                avg_log_vals = np.mean(np.array(all_log_vals), axis=0)
                dic_logs = {}
                for k, v in zip(log_keys, avg_log_vals):
                    dic_logs[k] = v
                    # print(k, v)

                log_line = "; ".join("%s: %s" % (str(k), str(dic_logs[k]))
                                     for k in dic_logs)
                print("Epoch %d | " % (epoch) + log_line)
                sys.stdout.flush()
                if np.any(np.isnan(avg_log_vals)):
                    raise ValueError("NaN detected!")
updates_per_epoch = number_examples // cfg.TRAIN.BATCH_SIZE
number_examples_val = val_dataset._num_examples
updates_per_epoch_val = number_examples_val // cfg.TRAIN.BATCH_SIZE

# Create one-hot answers dictionary to be used in prepare answers
with open('./data/clver_rn/answer_to_ix.json', 'r') as answer_file:
    answer_to_ix = json.load(answer_file)
answer_to_one_hot = {}
one_hot_init_vector = [0] * len(answer_to_ix)

# Set up the training loop
for epoch in range(1, args.epochs + 1):
    epoch_start_time = time.time()
    widgets = ['epoch #%d|' % epoch, Percentage(), Bar(), ETA()]
    pbar = ProgressBar(maxval=updates_per_epoch, widgets=widgets)
    pbar.start()

    # Call the train and the test step for the dataset
    epoch_loss, epoch_accuracy = train(updates_per_epoch)
    log_line_train = '%s: %0.4f; %s: %0.4f; ' % (
        "Training Loss", epoch_loss, "Training Accuracy", epoch_accuracy)
    val_accuracy = val(updates_per_epoch_val)
    log_line_val = '%s: %0.4f ' % ("Validation Accuracy", val_accuracy)

    epoch_end_time = time.time()
    time_taken = epoch_end_time - epoch_start_time
    log_time_line = '%s: %0.4f' % ("Time taken for the current epoch",
                                   time_taken)
    sys.stdout.flush()
    print("Epoch %d | " % (epoch) + log_line_train + log_line_val +
          log_time_line)
filenames = ["Stanford_Online_Products.zip"] urls = [base_url + f for f in filenames] fuel_data_path = os.path.join(fuel_root_path, "online_products") os.mkdir(fuel_data_path) for filename in filenames: url = base_url + filename filepath = os.path.join(fuel_data_path, filename) with contextlib.closing(request.urlopen(url)) as f: expected_filesize = int(f.headers["content-length"]) print(expected_filesize) time.sleep(5) widgets = ['{}: '.format(filename), Percentage(), ' ', Bar(), ' ', ETA(), ' ', FileTransferSpeed()] progress_bar = ProgressBar(widgets=widgets, maxval=expected_filesize).start() def reporthook(count, blockSize, totalSize): progress_bar.update(min(count*blockSize, totalSize)) request.urlretrieve(url, filepath, reporthook=reporthook) progress_bar.finish() downloaded_filesize = os.path.getsize(filepath) assert expected_filesize == downloaded_filesize, " ".join(( "expected file size is {}, but the actual size of the downloaded file", "is {}.")).format(expected_filesize, downloaded_filesize)
def generate_template_dtree(compute_function, classname, pool, outfile, doc,
                            login, align='left', MathJax=False):
    # TODO: Support for right align in 'parent' functions
    from parampool.generator.flask.latex_symbols import \
        get_symbol, symbols_same_size
    import inspect
    args = inspect.getargspec(compute_function).args
    app_dir = outfile.split("templates")[0]
    static_dir = os.path.join(app_dir, "static")
    compute_function_name = compute_function.__name__

    pre_code = """\
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>Django %(compute_function_name)s app</title>
    <link rel="StyleSheet" href="static/dtree.css" type="text/css" />
    <script type="text/javascript" src="static/dtree.js"></script>
  </head>
  <body>
""" % vars()

    if MathJax:
        pre_code += '''
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
  TeX: {
    equationNumbers: { autoNumber: "AMS" },
    extensions: ["AMSmath.js", "AMSsymbols.js", "autobold.js"]
  }
});
</script>
<script type="text/javascript"
 src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<!-- Fix slow MathJax rendering in IE8 -->
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7">
'''

    if login:
        pre_code += '''\
{% if user.is_anonymous %}
<p align="right"><a href="/login">Login</a> / <a href="/reg">Register</a></p>
{% else %}
<p align="right">Logged in as {{user}}<br><a href="/old">Previous simulations</a><br><a href="/logout">Logout</a></p>
{% endif %}
'''

    pre_code += """\
%(doc)s

<!-- Input and Results are typeset as a two-column table -->
<table>
<tr>
<td valign="top">
<h2>Input:</h2>
<div class="dtree">
<p><a href="javascript: d.openAll();">open all</a> | <a href="javascript: d.closeAll();">close all</a></p>
<form method=post action="" enctype="multipart/form-data">{%% csrf_token %%}
<script type="text/javascript">
d = new dTree('d');
""" % vars()

    post_code = """\
document.write(d);
</script>
</div>
<p><input type="submit" value="Compute"></form></p>
</td>
<td valign="top">
{% if result != None %}
<h2>Result:</h2>
{{ result|safe }}
"""
    if login:
        post_code += '''
{% if not user.is_anonymous %}
<h3>Comments:</h3>
<form method=post action="/add_comment/">{% csrf_token %}
<textarea name="comments" rows="4" cols="40"></textarea>
<p><input type="submit" value="Add">
</form>
{% endif %}
'''
    post_code += '''
{% endif %}
</td>
</tr>
</table>
</body>
</html>'''

    def leaf_func(tree_path, level, item, user_data):
        id = user_data.id
        parent_id = user_data.parent_id[-1]
        name = item.name
        field_name = parampool.utils.legal_variable_name(name)
        form = """\
{%% spaceless %%}
{{ form.%(field_name)s }}
{%% endspaceless %%} \
{%% if form.%(field_name)s.errors %%} \
  {%% for error in form.%(field_name)s.errors %%} \
    <err> {{error}} </err> \
  {%% endfor %%}{%% endif %%}
""" % vars()
        # (Note: need the spaceless trick to ensure that the resulting HTML
        # code is on one line for select widgets in Django (strange behavior))

        if hasattr(user_data, 'pb'):
            user_data.pb.update(user_data.pbid)
            user_data.pbid += 1

        if 'symbol' in item.data:
            symbol = item.data["symbol"]
        else:
            symbol = "\\mbox{%s}" % name
        imgsrc = get_symbol(symbol, static_dir, tree_path)
        imgsrc = os.sep + "static" + imgsrc.split("static")[-1]

        # Use slider and show current value
        if item.data.get("widget", None) in ("range", "integer_range"):
            showvalue = ' <span id="range"></span>'
        else:
            showvalue = ""

        # Make label
        label = []
        if 'help' in item.data:
            label.append(item.data['help'])
        if 'unit' in item.data:
            label.append('Unit: ' + item.data['unit'])
        label = ' '.join(label)

        if align == "right":
            line = '%(form)s<img src="%(imgsrc)s" height="18" />' % vars()
            line += showvalue
            user_data.code += """\
d.add(%(id)i, %(parent_id)i, '%(line)s', '#', '%(label)s');
""" % vars()
        else:
            form += showvalue
            user_data.code += """\
d.add(%(id)i, %(parent_id)i, '%(form)s', '#', '%(label)s', '', '%(imgsrc)s');
""" % vars()
        user_data.id += 1

    def subtree_start_func(tree_path, level, item, user_data):
        id = user_data.id
        parent_id = user_data.parent_id[-1]
        name = item.name
        user_data.code += """\
d.add(%(id)i, %(parent_id)i, '%(name)s');
""" % vars()
        user_data.parent_id.append(user_data.id)
        user_data.id += 1

    def subtree_end_func(tree_path, level, item, user_data):
        del user_data.parent_id[-1]

    class CodeData:
        """Object to hold output code through tree recursion."""
        id = 0
        pbid = 0
        parent_id = [-1]

    codedata = CodeData()
    codedata.code = pre_code

    # Display a progressbar if we have many data items
    pool.update()
    num_widgets = len(args) if pool is None else len(pool.paths2data_items)
    display_progressbar = num_widgets >= 10
    if display_progressbar:
        from progressbar import \
            ProgressBar, Percentage, Bar, ETA, RotatingMarker
        widgets = ['Generating: ', Percentage(), ' ',
                   Bar(marker=RotatingMarker()), ' ', ETA()]
        pb = ProgressBar(widgets=widgets, maxval=num_widgets - 1).start()
        codedata.pb = pb

    pool.traverse(callback_leaf=leaf_func,
                  callback_subtree_start=subtree_start_func,
                  callback_subtree_end=subtree_end_func,
                  user_data=codedata,
                  verbose=False)
    if display_progressbar:
        pb.finish()

    code = codedata.code + post_code
    symbols_same_size(static_dir)

    if outfile is None:
        return code
    else:
        f = open(outfile, 'w')
        f.write(code)
        f.close()
def __init__(self, maxval=0):
    widgets = [Percentage(), ' ',
               Bar(marker='=', left='[', right=']'), ' ',
               ETA()]
    super(ProgressBarContext, self).__init__(widgets=widgets,
                                             maxval=maxval,
                                             fd=sys.stdout)
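# Hedged usage sketch (not from the original source): this assumes the rest of
# ProgressBarContext implements the context-manager protocol
# (__enter__/__exit__), which the snippet above does not show.
def _demo_progressbar_context(items):
    with ProgressBarContext(maxval=len(items)) as pb:
        for i, _ in enumerate(items):
            pb.update(i + 1)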
pbar = ProgressBar(maxval=len(masterscrape), term_width=100).start()
while not result.ready():
    pbar.update(len(masterscrape) - result._number_left)
    time.sleep(1)
pbar.finish()

full_masterscrape = filter(None, result.get())
pool2.close()
pool2.join()

got = []
gc.disable()
print("Validating and deduping links")
pbar = ProgressBar(widgets=[Percentage(), Bar(), ETA()],
                   maxval=100, term_width=100).start()
# this loop is slow and needs to be faster. wtf.
for idx, n in enumerate(full_masterscrape):
    linkhost = n[0]
    appendable = [[linkhost, actual_link] for actual_link in n[1]
                  if actual_link not in dontvisit]
    masterappend = [(linkhost, actual_link) for actual_link in n[1]]
    dontvisitappend = [actual_link for actual_link in n[1]
                       if actual_link not in dontvisit]
args = parser.parse_args()
input_files = args.infiles

if not (args.parity == 'even' or args.parity == 'odd'):
    raise ValueError("parity argument must be 'even' or 'odd', not %s" %
                     args.parity)

logging.info("loading trees...")
in_tree = ROOT.TChain(args.treepath)
for i in input_files:
    in_tree.Add(i)

new_file = ROOT.TFile(args.outputfile, "recreate")
new_tree = in_tree.CloneTree(0)

entries = in_tree.GetEntries()
progress = ProgressBar(widgets=[ETA(), Bar('>')], maxval=entries).start()
for i, row in enumerate(in_tree):
    progress.update(i + 1)
    if args.parity == 'even' and (i % 2) == 0:
        in_tree.GetEntry(i)
        new_tree.Fill()
    elif args.parity == 'odd' and (i % 2) == 1:
        in_tree.GetEntry(i)
        new_tree.Fill()

new_tree.AutoSave()
del in_tree
del new_file
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

# other constants
OUI_LIST_URL = 'http://standards.ieee.org/develop/regauth/oui/oui.txt'
OUI_LIST_FILE = 'oui.txt'
OUI_RE = re_compile(r'^\s+([0-9A-F]{6})\s+\(base 16\)\s+(.*)\s*$')
IP4P_LIST_URL = \
    'http://www.iana.org/assignments/protocol-numbers/protocol-numbers.xml'
IP4P_LIST_FILE = 'protocol-numbers.xml'
HELPER_DATA = join(realpath(dirname(__file__)), 'scraper.dat')
UA = 'tollgate/%s (scraper.py; Python)' % __version__
PBAR_WIDGET_STYLE = [Percentage(), Bar(), ETA()]


def download_file(filename, url):
    etag_filename = filename + '.etag'

    # check to see if there's an existing dump of the data
    mtime = None
    if exists(filename):
        if not isfile(filename):
            raise Exception(
                ('ERROR: %s exists but is not a file. Please check this, and ' +
                 'move it out of the way so I can run.') % filename)

        # lets also check for an etag, and use it if it's there.
        etag = None
        if exists(etag_filename):
origin = Vec3(0, 0, 0)

world = HitableList()
world.append(Sphere(Vec3(0, 0, -1), 0.5, Lambertian(Vec3(0.8, 0.3, 0.3))))
world.append(Sphere(Vec3(0, -100.5, -1), 100, Lambertian(Vec3(0.8, 0.8, 0.0))))
world.append(Sphere(Vec3(1, 0, -1), 0.5, Metal(Vec3(0.8, 0.6, 0.2), fuzz=0.3)))
world.append(Sphere(Vec3(-1, 0, -1), 0.5, Metal(Vec3(0.8, 0.8, 0.8), fuzz=1.0)))

cam = Camera()

pbar = ProgressBar(
    widgets=['Percentage ', Percentage(), ' ', ETA(), ' ', Bar()],
    maxval=nx * ny).start()

with open('image.ppm', 'w') as f:
    f.write('P3\n{} {}\n255\n'.format(nx, ny))
    for y, j in enumerate(xrange(ny - 1, -1, -1)):
        for i in xrange(nx):
            col = Vec3(0, 0, 0)
            for _ in xrange(ns):
                u = float(i + random()) / nx
                v = float(j + random()) / ny
                r = cam.get_ray(u, v)
                p = r.point_at_parameter(2.0)
                col += color(r, world, depth=0)
            col /= float(ns)
def train(self):
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        with tf.device("/gpu:%d" % cfg.GPU_ID):
            counter = self.build_model(sess)
            saver = tf.train.Saver(tf.all_variables(),
                                   keep_checkpoint_every_n_hours=5)
            summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)

            if cfg.TRAIN.FINETUNE_LR:
                keys = ["hr_d_loss", "hr_g_loss", "d_loss", "g_loss"]
            else:
                keys = ["d_loss", "g_loss"]
            log_vars = []
            log_keys = []
            for k, v in self.log_vars:
                if k in keys:
                    log_vars.append(v)
                    log_keys.append(k)

            generator_lr = cfg.TRAIN.GENERATOR_LR
            discriminator_lr = cfg.TRAIN.DISCRIMINATOR_LR
            lr_decay_step = cfg.TRAIN.LR_DECAY_EPOCH
            number_example = self.dataset.train._num_examples
            updates_per_epoch = int(number_example / self.batch_size)
            # int((counter + lr_decay_step/2) / lr_decay_step)
            decay_start = cfg.TRAIN.PRETRAINED_EPOCH
            epoch_start = int(counter / updates_per_epoch)
            for epoch in range(epoch_start, self.max_epoch):
                widgets = ["epoch #%d|" % epoch, Percentage(), Bar(), ETA()]
                pbar = ProgressBar(maxval=updates_per_epoch, widgets=widgets)
                pbar.start()

                if epoch % lr_decay_step == 0 and epoch > decay_start:
                    generator_lr *= 0.5
                    discriminator_lr *= 0.5

                all_log_vals = []
                for i in range(updates_per_epoch):
                    pbar.update(i)
                    log_vals = self.train_one_step(generator_lr,
                                                   discriminator_lr,
                                                   counter, summary_writer,
                                                   log_vars, sess)
                    all_log_vals.append(log_vals)
                    # save checkpoint
                    counter += 1
                    if counter % self.snapshot_interval == 0:
                        snapshot_path = "%s/%s_%s.ckpt" %\
                            (self.checkpoint_dir, self.exp_name, str(counter))
                        fn = saver.save(sess, snapshot_path)
                        print("Model saved in file: %s" % fn)

                img_summary, img_summary2 = self.epoch_sum_images(
                    sess, cfg.TRAIN.NUM_COPY)
                summary_writer.add_summary(img_summary, counter)
                summary_writer.add_summary(img_summary2, counter)

                avg_log_vals = np.mean(np.array(all_log_vals), axis=0)
                dic_logs = {}
                for k, v in zip(log_keys, avg_log_vals):
                    dic_logs[k] = v

                log_line = "; ".join("%s: %s" % (str(k), str(dic_logs[k]))
                                     for k in dic_logs)
                print("Epoch %d | " % (epoch) + log_line)
                sys.stdout.flush()
                if np.any(np.isnan(avg_log_vals)):
                    raise ValueError("NaN detected!")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('source_path', default="./CCTV_News.mp4", nargs='?',
                        help="Path to the video or audio file to subtitle")
    parser.add_argument('-C', '--concurrency', type=int, default=10,
                        help="Number of concurrent API requests to make")
    parser.add_argument('-o', '--output',
                        help="Output path for subtitles (by default, subtitles "
                             "are saved in the same directory and name as the "
                             "source path)")
    parser.add_argument('-F', '--format', default="srt",
                        help="Destination subtitle format")
    parser.add_argument('-S', '--src-language', default="zh-CN",
                        help="Language spoken in source file")
    parser.add_argument('-D', '--dst-language', default="zh-CN",
                        help="Desired language for the subtitles")
    parser.add_argument('-K', '--api-key',
                        help="The Google Translate API key to be used. "
                             "(Required for subtitle translation)")
    parser.add_argument('--list-formats', action='store_true',
                        help="List all available subtitle formats")
    parser.add_argument('--list-languages', action='store_true',
                        help="List all available source/destination languages")

    args = parser.parse_args()

    if args.list_formats:
        print("List of formats:")
        for subtitle_format in FORMATTERS.keys():
            print("{format}".format(format=subtitle_format))
        return 0

    if args.list_languages:
        print("List of all languages:")
        for code, language in sorted(LANGUAGE_CODES.items()):
            print("{code}\t{language}".format(code=code, language=language))
        return 0

    if args.format not in FORMATTERS.keys():
        print("Subtitle format not supported. "
              "Run with --list-formats to see all supported formats.")
        return 1

    if args.src_language not in LANGUAGE_CODES.keys():
        print("Source language not supported. "
              "Run with --list-languages to see all supported languages.")
        return 1

    if args.dst_language not in LANGUAGE_CODES.keys():
        print("Destination language not supported. "
              "Run with --list-languages to see all supported languages.")
        return 1

    if not args.source_path:
        print("Error: You need to specify a source path.")
        return 1

    audio_filename, audio_rate = extract_audio(args.source_path)
    regions = find_speech_regions(audio_filename)
    pool = multiprocessing.Pool(args.concurrency)
    converter = WAVConverter(source_path=audio_filename, slicenum=len(regions))

    if regions:
        try:
            widgets = ["Converting speech regions to WAV files: ",
                       Percentage(), ' ', Bar(), ' ', ETA()]
            pbar = ProgressBar(widgets=widgets, maxval=len(regions)).start()
            extracted_regions = []
            for i, extracted_region in enumerate(pool.imap(converter, regions)):
                extracted_regions.append(extracted_region)
                pbar.update(i)
            pbar.finish()
            os.remove(audio_filename)
            wavlist = create_manifest(os.getcwd() + '/temp',
                                      os.getcwd() + '/temp' + '/wavlist.txt')
            transcripts = infer.infer_interface(wavlist, len(extracted_regions))
        except KeyboardInterrupt:
            pbar.finish()
            pool.terminate()
            pool.join()
            print("Cancelling transcription")
            return 1

    timed_subtitles = [(r, t) for r, t in zip(regions, transcripts) if t]
    formatter = FORMATTERS.get(args.format)
    formatted_subtitles = formatter(timed_subtitles)

    dest = args.output
    if not dest:
        base, ext = os.path.splitext(args.source_path)
        dest = "{base}.{format}".format(base=base, format=args.format)

    with open(dest, 'wb') as f:
        f.write(formatted_subtitles.encode("utf-8"))
    print("Subtitles file created at {}".format(dest))
    shutil.rmtree('temp')
    return 0
def _progress(iterable):
    if ProgressBar:
        pbar = ProgressBar(widgets=[SimpleProgress(), Bar(), ETA()])
    else:
        pbar = iter
    return pbar(iterable)
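# Hedged usage sketch (not from the original source): _progress assumes
# ProgressBar is falsy (e.g. None) when the progressbar package could not be
# imported, so it degrades to a plain iterator. A guarded import like the one
# below would produce that behavior.
# try:
#     from progressbar import ProgressBar, SimpleProgress, Bar, ETA
# except ImportError:
#     ProgressBar = None
for _item in _progress(range(1000)):
    pass  # process _item here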
def __init__(self, analysis, sample, **kwargs):
    # default to access via sample/analysis
    self.analysis = analysis
    self.sample = sample
    self.shift = kwargs.pop('shift', '')
    self.skipHists = kwargs.pop('skipHists', False)
    self.isData = isData(self.sample)
    self.intLumi = kwargs.get('intLumi', float(getLumi()))
    logging.debug('Initializing {0} {1} {2}'.format(self.analysis,
                                                    self.sample, self.shift))

    # backup passing custom parameters
    self.ntupleDirectory = kwargs.pop(
        'ntupleDirectory',
        '{0}/{1}'.format(getNtupleDirectory(self.analysis, shift=self.shift),
                         self.sample))
    self.inputFileList = kwargs.pop('inputFileList', '')
    self.outputFile = kwargs.pop(
        'outputFile',
        getNewFlatHistograms(self.analysis, self.sample, shift=self.shift))
    if os.path.dirname(self.outputFile):
        python_mkdir(os.path.dirname(self.outputFile))
    self.treeName = kwargs.pop('treeName', getTreeName(self.analysis))

    if hasProgress:
        self.pbar = kwargs.pop(
            'progressbar',
            ProgressBar(widgets=['{0}: '.format(sample), ' ',
                                 SimpleProgress(), ' ', Percentage(), ' ',
                                 Bar(), ' ', ETA()]))
    else:
        self.pbar = None

    # get stuff needed to flatten
    self.infile = 0
    self.tchain = 0
    self.initialized = False
    self.hists = {}
    self.datasets = {}
def main(argv):
    ##################
    # These change a lot
    numWaveforms = 30
    numThreads = 12

    ndim = 6 * numWaveforms + 8
    nwalkers = 4 * ndim

    iter = 5000
    burnIn = 4000
    wfPlotNumber = 100
    ######################

    # plt.ion()

    fitSamples = 200

    # Prepare detector
    zero_1 = -5.56351644e+07
    pole_1 = -1.38796386e+04
    pole_real = -2.02559385e+07
    pole_imag = 9885315.37450211

    zeros = [zero_1, 0]
    poles = [pole_real + pole_imag * 1j, pole_real - pole_imag * 1j, pole_1]
    system = signal.lti(zeros, poles, 1E7)

    tempGuess = 77.89
    gradGuess = 0.0483
    pcRadGuess = 2.591182
    pcLenGuess = 1.613357

    # Create a detector model
    detName = "conf/P42574A_grad%0.2f_pcrad%0.2f_pclen%0.2f.conf" % (0.05, 2.5, 1.65)
    det = Detector(detName, temperature=tempGuess, timeStep=1.,
                   numSteps=fitSamples * 10, tfSystem=system)
    det.LoadFields("P42574A_fields_v3.npz")
    det.SetFields(pcRadGuess, pcLenGuess, gradGuess)

    tempIdx = -8
    gradIdx = -7
    pcRadIdx = -6
    pcLenIdx = -5
    # and the remaining 4 are for the transfer function

    fig_size = (20, 10)

    # Create a decent start guess by fitting waveform-by-waveform
    wfFileName = "P42574A_512waveforms_%drisetimeculled.npz" % numWaveforms
    if os.path.isfile(wfFileName):
        data = np.load(wfFileName)
        results = data['results']
        wfs = data['wfs']
        numWaveforms = wfs.size
    else:
        print "No saved waveforms available. Loading from Data"
        exit(0)

    # prep holders for each wf-specific param
    r_arr = np.empty(numWaveforms)
    phi_arr = np.empty(numWaveforms)
    z_arr = np.empty(numWaveforms)
    scale_arr = np.empty(numWaveforms)
    t0_arr = np.empty(numWaveforms)
    smooth_arr = np.ones(numWaveforms) * 7.
    simWfArr = np.empty((1, numWaveforms, fitSamples))

    # Prepare the initial value arrays
    for (idx, wf) in enumerate(wfs):
        wf.WindowWaveformTimepoint(fallPercentage=.99)
        r_arr[idx], phi_arr[idx], z_arr[idx], scale_arr[idx], t0_arr[idx], \
            smooth_arr[idx] = results[idx]['x']
        t0_arr[idx] += 10  # because i had a different windowing offset back in the day

    # Plot the waveforms to take a look at the initial guesses
    if False:
        fig = plt.figure()
        for (idx, wf) in enumerate(wfs):
            print "WF number %d:" % idx
            print "  >>r: %f\n  >>phi %f\n  >>z %f\n  >>e %f\n  >>t0 %f\n  >>smooth %f" % (
                r_arr[idx], phi_arr[idx], z_arr[idx], scale_arr[idx],
                t0_arr[idx], smooth_arr[idx])
            ml_wf = det.GetSimWaveform(r_arr[idx], phi_arr[idx], z_arr[idx],
                                       scale_arr[idx] * 100, t0_arr[idx],
                                       fitSamples, smoothing=smooth_arr[idx])
            plt.plot(ml_wf, color="b")
            plt.plot(wf.windowedWf, color="r")
        value = raw_input('  --> Press q to quit, any other key to continue\n')
        if value == 'q':
            exit(0)

    # Initialize the multithreading
    p = Pool(numThreads, initializer=initializeDetectorAndWaveforms,
             initargs=[det, wfs])
    initializeDetectorAndWaveforms(det, wfs)

    # Do the MCMC
    mcmc_startguess = np.hstack((
        r_arr[:], phi_arr[:], z_arr[:], scale_arr[:] * 100.,
        t0_arr[:], smooth_arr[:],                       # waveform-specific params
        tempGuess, gradGuess, pcRadGuess, pcLenGuess,
        zero_1, pole_1, pole_real, pole_imag))          # detector-specific

    # number of walkers _must_ be even
    if nwalkers % 2:
        nwalkers += 1

    # Initialize walkers with a random, narrow ball around the start guess
    pos0 = [mcmc_startguess + 1e-2 * np.random.randn(ndim) * mcmc_startguess
            for i in range(nwalkers)]

    # Make sure everything in the initial guess is within bounds
    for pos in pos0:
        pos[:numWaveforms] = np.clip(
            pos[:numWaveforms], 0, np.floor(det.detector_radius * 10.) / 10.)
        pos[numWaveforms:2 * numWaveforms] = np.clip(
            pos[numWaveforms:2 * numWaveforms], 0, np.pi / 4)
        pos[2 * numWaveforms:3 * numWaveforms] = np.clip(
            pos[2 * numWaveforms:3 * numWaveforms], 0,
            np.floor(det.detector_length * 10.) / 10.)
        pos[4 * numWaveforms:5 * numWaveforms] = np.clip(
            pos[4 * numWaveforms:5 * numWaveforms], 0, fitSamples)
        pos[5 * numWaveforms:6 * numWaveforms] = np.clip(
            pos[5 * numWaveforms:6 * numWaveforms], 0, 20.)

        pos[tempIdx] = np.clip(pos[tempIdx], 40, 120)
        pos[gradIdx] = np.clip(pos[gradIdx], det.gradList[0], det.gradList[-1])
        pos[pcRadIdx] = np.clip(pos[pcRadIdx], det.pcRadList[0], det.pcRadList[-1])
        pos[pcLenIdx] = np.clip(pos[pcLenIdx], det.pcLenList[0], det.pcLenList[-1])

        prior = lnprior(pos, )
        if not np.isfinite(prior):
            print "BAD PRIOR WITH START GUESS YOURE KILLING ME SMALLS"
            print pos
            exit(0)

    # Initialize, run the MCMC
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, pool=p)

    # w/ progress bar, & time the thing
    bar = ProgressBar(widgets=[Percentage(), Bar()], maxval=iter).start()
    start = timer()
    for (idx, result) in enumerate(
            sampler.sample(pos0, iterations=iter, storechain=True)):
        bar.update(idx + 1)
    end = timer()
    bar.finish()

    print "Elapsed time: " + str(end - start)

    print "Dumping chain to file..."
    np.save("sampler_%dwfs.npy" % numWaveforms, sampler.chain)

    print "Making MCMC steps figure..."

    ######### Plots for Waveform params
    stepsFig = plt.figure(2, figsize=fig_size)
    plt.clf()
    ax0 = stepsFig.add_subplot(611)
    ax1 = stepsFig.add_subplot(612, sharex=ax0)
    ax2 = stepsFig.add_subplot(613, sharex=ax0)
    ax3 = stepsFig.add_subplot(614, sharex=ax0)
    ax4 = stepsFig.add_subplot(615, sharex=ax0)
    ax5 = stepsFig.add_subplot(616, sharex=ax0)

    ax0.set_ylabel('r')
    ax1.set_ylabel('phi')
    ax2.set_ylabel('z')
    ax3.set_ylabel('scale')
    ax4.set_ylabel('t0')
    ax5.set_ylabel('smoothing')

    for i in range(nwalkers):
        for j in range(wfs.size):
            ax0.plot(sampler.chain[i, :, 0 + j], alpha=0.3)                 # r
            ax1.plot(sampler.chain[i, :, numWaveforms + j], alpha=0.3)      # phi
            ax2.plot(sampler.chain[i, :, 2 * numWaveforms + j], alpha=0.3)  # z
            ax3.plot(sampler.chain[i, :, 3 * numWaveforms + j], alpha=0.3)  # energy
            ax4.plot(sampler.chain[i, :, 4 * numWaveforms + j], alpha=0.3)  # t0
            ax5.plot(sampler.chain[i, :, 5 * numWaveforms + j], alpha=0.3)  # smoothing

    plt.savefig("emcee_wfchain_%dwfs.png" % numWaveforms)

    ######### Plots for Detector params
    stepsFigDet = plt.figure(3, figsize=fig_size)
    plt.clf()
    ax0 = stepsFigDet.add_subplot(411)
    ax1 = stepsFigDet.add_subplot(412, sharex=ax0)
    ax2 = stepsFigDet.add_subplot(413, sharex=ax0)
    ax3 = stepsFigDet.add_subplot(414, sharex=ax0)

    ax0.set_ylabel('temp')
    ax1.set_ylabel('grad')
    ax2.set_ylabel('pcRad')
    ax3.set_ylabel('pcLen')

    for i in range(nwalkers):
        ax0.plot(sampler.chain[i, :, tempIdx], "b", alpha=0.3)   # temp
        ax1.plot(sampler.chain[i, :, gradIdx], "b", alpha=0.3)   # grad
        ax2.plot(sampler.chain[i, :, pcRadIdx], "b", alpha=0.3)  # pcrad
        ax3.plot(sampler.chain[i, :, pcLenIdx], "b", alpha=0.3)  # pclen

    plt.savefig("emcee_detchain_%dwfs.png" % numWaveforms)

    # and for the transfer function
    stepsFigTF = plt.figure(4, figsize=fig_size)
    plt.clf()
    tf0 = stepsFigTF.add_subplot(411)
    tf1 = stepsFigTF.add_subplot(412, sharex=ax0)
    tf2 = stepsFigTF.add_subplot(413, sharex=ax0)
    tf3 = stepsFigTF.add_subplot(414, sharex=ax0)
    tf0.set_ylabel('zero_1')
    tf1.set_ylabel('pole_1')
    tf2.set_ylabel('pole_real')
    tf3.set_ylabel('pole_imag')

    for i in range(nwalkers):
        tf0.plot(sampler.chain[i, :, -4], "b", alpha=0.3)  # zero_1
        tf1.plot(sampler.chain[i, :, -3], "b", alpha=0.3)  # pole_1
        tf2.plot(sampler.chain[i, :, -2], "b", alpha=0.3)  # pole_real
        tf3.plot(sampler.chain[i, :, -1], "b", alpha=0.3)  # pole_imag

    plt.savefig("emcee_tfchain_%dwfs.png" % numWaveforms)

    samples = sampler.chain[:, burnIn:, :].reshape((-1, ndim))

    print "temp is %f" % np.median(samples[:, tempIdx])
    print "grad is %f" % np.median(samples[:, gradIdx])
    print "pcrad is %f" % np.median(samples[:, pcRadIdx])
    print "pclen is %f" % np.median(samples[:, pcLenIdx])
    print "zero_1 is %f" % np.median(samples[:, -4])
    print "pole_1 is %f" % np.median(samples[:, -3])
    print "pole_real is %f" % np.median(samples[:, -2])
    print "pole_imag is %f" % np.median(samples[:, -1])

    # TODO: Aaaaaaand plot some waveforms..
    simWfs = np.empty((wfPlotNumber, numWaveforms, fitSamples))

    for idx, (theta) in enumerate(samples[np.random.randint(
            len(samples), size=wfPlotNumber)]):
        temp, impGrad, pcRad, pcLen = (theta[tempIdx], theta[gradIdx],
                                       theta[pcRadIdx], theta[pcLenIdx])
        zero_1, pole_1, pole_real, pole_imag = theta[-4:]
        r_arr, phi_arr, z_arr, scale_arr, t0_arr, smooth_arr = \
            theta[:-8].reshape((6, numWaveforms))
        det.SetTemperature(temp)
        det.SetFields(pcRad, pcLen, impGrad)

        zeros = [zero_1, 0]
        poles = [pole_real + pole_imag * 1j, pole_real - pole_imag * 1j, pole_1]
        det.SetTransferFunction(zeros, poles, 1E7)

        for wf_idx in range(wfs.size):
            wf_i = det.GetSimWaveform(r_arr[wf_idx], phi_arr[wf_idx],
                                      z_arr[wf_idx], scale_arr[wf_idx],
                                      t0_arr[wf_idx], fitSamples)
            simWfs[idx, wf_idx, :] = wf_i
            if wf_i is None:
                print "Waveform %d, %d is None" % (idx, wf_idx)

    residFig = plt.figure(4, figsize=(20, 15))
    helpers.plotManyResidual(simWfs, wfs, figure=residFig)
    plt.savefig("emcee_waveforms_%dwfs.png" % numWaveforms)
# Bind p(x, z) and q(z | x) to the same placeholder for x.
data = {x: x_ph}
inference = ed.ReparameterizationKLKLqp({z: qz}, data)
optimizer = tf.train.AdamOptimizer(0.01, epsilon=1.0)
inference.initialize(optimizer=optimizer, use_prettytensor=True)

init = tf.global_variables_initializer()
init.run()

n_epoch = 100
n_iter_per_epoch = 1000
for epoch in range(n_epoch):
    avg_loss = 0.0

    widgets = ["epoch #%d|" % epoch, Percentage(), Bar(), ETA()]
    pbar = ProgressBar(n_iter_per_epoch, widgets=widgets)
    pbar.start()
    for t in range(n_iter_per_epoch):
        pbar.update(t)
        x_train, _ = mnist.train.next_batch(M)
        info_dict = inference.update(feed_dict={x_ph: x_train})
        avg_loss += info_dict['loss']

    # Print a lower bound to the average marginal likelihood for an image.
    avg_loss = avg_loss / n_iter_per_epoch
    avg_loss = avg_loss / M
    print("log p(x) >= {:0.3f}".format(avg_loss))

# Visualize hidden representations.
time = odata_pt[:, 1]
tindex = abs(time - pf.current_time.v).argmin()

if args.subsample >= 0 and pf.h.max_level - args.undersample < args.subsample:
    print 'ERROR: Subsample must be less than max refine level - undersample.'
    sys.exit()

maxval = np.empty(len(args.vars))
minval = np.empty(len(args.vars))
maxval.fill(-float("inf"))
minval.fill(float("inf"))
vals = list()

pbar = ProgressBar(widgets=['Determining histogram bounds and initial pass of data: ',
                            Percentage(), Bar(), ' ', ETA()],
                   maxval=len(pf.index.grids)).start()
for cnt, g in enumerate(pf.index.grids):
    if g.Level > pf.h.max_level - args.undersample:
        continue
    if len(g.Children) != 0 and g.Level != pf.h.max_level - args.undersample:
        continue
    evals = list()
    vvals = list()
    #vvals = g.get_data(args.var).ravel()
    for e, ev in enumerate(args.vars):
        vvals.append(g[ev].ravel())
    dvals = g["dens"].ravel().v * g["o16 "].ravel().v
def main(arguments):
    """
    Main method of the file.

    Args:
        arguments: ArgumentParser object which contains user-configurable
            parameters. For more info, look into parseArguments() method.

    Returns:
        None
    """
    # Download data, if asked by user.
    if arguments.download_data:
        # Create data folder, if it doesn't exist.
        if not os.path.exists(os.path.join("data")):
            os.makedirs(os.path.join("data"))

        # Define URL to get data from,
        # and the zip folder path where the data will be stored.
        url = "http://cs231n.stanford.edu/tiny-imagenet-200.zip"
        zipFolder = os.path.join("data", "tiny-imagenet-200.zip")

        # Define a method which will unzip the downloaded zip folder.
        def unzip():
            print("Unzipping data...")
            zipFile = ZipFile(zipFolder, "r")
            uncompressedSize = sum(file.file_size for file in zipFile.infolist())
            extractedSize = 0
            pbar = ProgressBar(widgets=[Percentage(), Bar()], maxval=100).start()
            start = datetime.datetime.now()
            for file in zipFile.infolist():
                extractedSize += file.file_size
                percent = extractedSize * 100 / uncompressedSize
                pbar.update(percent)
                zipFile.extract(file, path=zipFolder.rsplit(".", 1)[0])
            print("Unzipped in {} s.".format(
                (datetime.datetime.now() - start).seconds))
            zipFile.close()

        # Proceed to download only if the tiny imagenet folder does not already exist.
        if not os.path.exists(os.path.join("data", "tiny-imagenet-200")):
            # Proceed to download only if the downloaded zip folder does not exist,
            # else directly unzip the previously downloaded zip file.
            if not os.path.isfile(zipFolder):
                print("Retrieving dataset from web...")
                pbar = ProgressBar(widgets=[Percentage(), Bar()],
                                   maxval=100).start()

                def dlProgress(count, blockSize, totalSize):
                    percent = int(count * blockSize * 100 / totalSize)
                    pbar.update(percent)

                start = datetime.datetime.now()
                urlretrieve(url, zipFolder, reporthook=dlProgress)
                print("Downloaded in {} s.".format(
                    (datetime.datetime.now() - start).seconds))
            unzip()
            reconfigureValSet(arguments.data_dir)
        else:
            print("Dataset folder already exists.")

    # Define image parameters.
    imgWidth = 64          # width of image
    imgHeight = 64         # height of image
    imgChannels = 3        # channels of image, RGB
    lenTrainData = 100000  # total number of training data
    lenValData = 10000     # total number of validation data
    classes = 200          # number of classes

    # Define image shape.
    if K.image_data_format() == "channels_first":
        imgShape = (imgChannels, imgWidth, imgHeight)
    else:
        imgShape = (imgWidth, imgHeight, imgChannels)

    # Create Model.
    network = arguments.model
    if network == "vgg16":
        batchSize = 256
        numEpochs = 2000  # 74 epochs used in the original paper
        learningRate = 0.01
        from models import vgg16
        model = vgg16.createNetwork(imgShape, classes, learningRate,
                                    arguments.pretrained, arguments.init)
    elif network == "resnet50":
        batchSize = 256
        numEpochs = 2000  # 60e4 iterations in the original paper
        learningRate = 0.1
        from models import resnet50
        model = resnet50.createNetwork(imgShape, classes, learningRate)
    elif network == "ext-resnet41":
        batchSize = 256
        numEpochs = 2000
        learningRate = 0.1
        from models import ext_resnet41
        model = ext_resnet41.createNetwork(imgShape, classes, learningRate)
    elif network == "ext-resnet50":
        batchSize = 256
        numEpochs = 2000
        learningRate = arguments.lr
        from models import ext_resnet50
        model = ext_resnet50.createNetwork(imgShape, classes, learningRate,
                                           arguments.activation, arguments.init,
                                           arguments.loss, arguments.do,
                                           arguments.kernel_size,
                                           arguments.kernel_number)
    elif network == "ext-resnet62":
        batchSize = 256
        numEpochs = 2000
        learningRate = 0.1
        from models import ext_resnet62
        model = ext_resnet62.createNetwork(imgShape, classes, learningRate)

    # Set featurewise mean of dataset.
    if K.image_data_format() == "channels_first":
        featurewiseMean = np.array([0.485, 0.456, 0.406],
                                   dtype=np.float32).reshape(3, 1, 1)
        featurewiseStd = np.array([0.229, 0.224, 0.225],
                                  dtype=np.float32).reshape(3, 1, 1)
    else:
        featurewiseMean = np.array([0.485, 0.456, 0.406],
                                   dtype=np.float32).reshape(1, 1, 3)
        featurewiseStd = np.array([0.229, 0.224, 0.225],
                                  dtype=np.float32).reshape(1, 1, 3)

    # Create training data generator.
    if arguments.data_aug == "no":
        trainDataGen = ImageDataGenerator(featurewise_center=True)
    elif arguments.data_aug == "yes":
        trainDataGen = ImageDataGenerator(featurewise_center=True,
                                          horizontal_flip=True,
                                          vertical_flip=True,
                                          rotation_range=20,
                                          width_shift_range=0.2,
                                          height_shift_range=0.2)
        trainDataGen.std = featurewiseStd
    else:
        trainDataGen = ImageDataGenerator(featurewise_center=True,
                                          horizontal_flip=True,
                                          vertical_flip=True)
    trainDataGen.mean = featurewiseMean
    trainGen = trainDataGen.flow_from_directory(
        os.path.join(arguments.data_dir, "train"),
        target_size=(imgWidth, imgHeight),
        batch_size=batchSize)

    # Create validation data generator.
    if arguments.reconfigure_val:
        reconfigureValSet(arguments.data_dir)
    valDataGen = ImageDataGenerator(featurewise_center=True)
    valDataGen.mean = featurewiseMean
    if arguments.data_aug == "yes":
        valDataGen.std = featurewiseStd
    valGen = valDataGen.flow_from_directory(
        os.path.join(arguments.data_dir, "val"),
        target_size=(imgWidth, imgHeight),
        batch_size=batchSize)

    # Create directories to store output.
    if arguments.pretrained:
        network = network + "-imagenet"
    if not os.path.exists(os.path.join("output", network)):
        os.makedirs(os.path.join("output", network))
    name = arguments.name.replace(" ", "_")
    if not os.path.exists(os.path.join("output", network, name)):
        os.makedirs(os.path.join("output", network, name))
    outputTime = str(datetime.datetime.now()).replace(" ", "_").replace(":", ".")
    if not os.path.exists(os.path.join("output", network, name, outputTime)):
        os.makedirs(os.path.join("output", network, name, outputTime))
    if not os.path.exists(os.path.join("weights", network, name)):
        os.makedirs(os.path.join("weights", network, name))

    # Create callbacks.
    tbCallback = TensorBoard(
        log_dir="./logs/{}/{}/{}".format(network, name, outputTime),
        histogram_freq=0, write_grads=True, write_graph=True)
    lrCallback = ReduceLROnPlateau(monitor="val_categorical_accuracy",
                                   patience=5, factor=0.1, verbose=1,
                                   min_lr=0.00001)
    esCallback = EarlyStopping(monitor="val_categorical_accuracy",
                               patience=10, verbose=1, min_delta=0.0001)

    # Fit model to the dataset.
    model.fit_generator(trainGen,
                        steps_per_epoch=lenTrainData // batchSize,
                        epochs=numEpochs,
                        validation_data=valGen,
                        validation_steps=lenValData // batchSize,
                        callbacks=[tbCallback, lrCallback, esCallback])

    # Save the model weights.
    model.save_weights(os.path.join("weights", network, name,
                                    outputTime + ".h5"))
def doScript():
    VERBOSE = False

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=banner())
    parser.add_argument("--bake", action="store_true",
                        help="Compile templates and SASS (yum!)")
    parser.add_argument("--clean", action="store_true",
                        help="Clean up the mess (mom would be proud!) [Selected when no options are given]")
    parser.add_argument("--controls", action="store_true",
                        help="Generate slide control files (gonna have something already baked)")
    parser.add_argument("--controlsonly", action="store_true",
                        help="Only generate control files")
    parser.add_argument("--dev", action="store_true",
                        help="Use the quick-bake test kitchen environment (no screenshots, no packaging). "
                             "This is a shortcut to using go --clean --watch --veev2rel")
    parser.add_argument("--init", action="store_true",
                        help="Initialize a new VELVEEVA project")
    parser.add_argument("--nuke", action="store_true",
                        help="Nuke old builds and temp files")
    parser.add_argument("--nobake", action="store_true",
                        help="Don't bake it...")
    parser.add_argument("--package", action="store_true",
                        help="Wrap it up [Selected when no options are given]")
    parser.add_argument("--packageonly", action="store_true",
                        help="Just wrap it up (you gotta already have something baked)")
    parser.add_argument("--publish", action="store_true",
                        help="Ship it off to market")
    parser.add_argument("--publishonly", action="store_true",
                        help="(Only) ship it off to market (you gotta already have something baked, "
                             "and control files generated)")
    parser.add_argument("--relink", action="store_true",
                        help="Make some href sausage (replace relative links with global "
                             "and convert to veeva: protocol)")
    parser.add_argument("--screenshots", action="store_true",
                        help="Include Screenshots [Selected when no options are given]")
    parser.add_argument("--veev2rel", action="store_true",
                        help="Convert veeva: hrefs to relative links")
    parser.add_argument("--verbose", action="store_true", help="Chatty Cathy")
    parser.add_argument("--watch", action="store_true",
                        help="Watch for changes and re-bake on change")

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()
    config = parse_config()

    SOURCE_DIR = config['MAIN']['source_dir']
    DEST_DIR = config['MAIN']['output_dir']
    GLOBALS_DIR = config['MAIN']['globals_dir']
    PARTIALS_DIR = config['MAIN']['partials_dir']
    TEMPLATES_DIR = config['MAIN']['templates_dir']
    ZIPS_DIR = config['MAIN']['zips_dir']

    ### CTL File Info ###
    CTLS_DIR = config['MAIN']['ctls_dir']
    VEEVA_USERNAME = config['VEEVA']['username']
    VEEVA_PASSWORD = config['VEEVA']['password']
    VEEVA_SERVER = config['VEEVA']['server']
    VEEVA_EMAIL = config['VEEVA'].get('email', None)

    ROOT_DIR = os.getcwd()
    CONFIG_FILE_NAME = "VELVEEVA-config.json"
    PROJECT_NAME = config['MAIN']['name']
    VELVEEVA_DIR = os.path.dirname(os.path.abspath(inspect.stack()[0][1]))

    print(banner())
    print("%s\n" % paint.bold.yellow(PROJECT_NAME))

    try:
        with ProgressBar(max_value=11,
                         widgets=[Bar(), Percentage()],
                         redirect_stdout=True) as progress:
            # 0. nuke
            print(paint.gray("Nuking old builds..."))
            progress.update(1)
            nuke(ROOT_DIR, config)

            # 1. scaffold needed folders
            print(paint.gray("Creating directories..."))
            progress.update(2)
            scaffold(ROOT_DIR, config)

            # 2. inline local (non-html) files, and create build folders
            print(paint.gray("Inlining partials and globals..."))
            progress.update(3)
            copy_locals(ROOT_DIR, SOURCE_DIR, DEST_DIR)

            # 3. inline partials and globals
            progress.update(4)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "inject.py")
            for out in execute(["python3", cmd, ROOT_DIR, GLOBALS_DIR, DEST_DIR]):
                print(out)

            # 4. render sass
            print(paint.gray("Compiling SASS..."))
            progress.update(5)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "compile_sass.py")
            for out in execute(["python3", cmd, os.path.join(ROOT_DIR, DEST_DIR)]):
                print(out)

            # 5. render templates
            print(paint.gray("Rendering templates..."))
            progress.update(6)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "render_templates.py")
            for out in execute(["python3", cmd,
                                os.path.join(ROOT_DIR, SOURCE_DIR),
                                os.path.join(ROOT_DIR, DEST_DIR),
                                os.path.join(ROOT_DIR, TEMPLATES_DIR),
                                os.path.join(ROOT_DIR, PARTIALS_DIR)]):
                print(out)

            # 6. take screenshots
            print(paint.gray("Taking screenshots..."))
            progress.update(7)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "screenshot.py")
            src = os.path.abspath(os.path.join(ROOT_DIR, DEST_DIR))
            cfg = os.path.abspath(os.path.join(ROOT_DIR, CONFIG_FILE_NAME))
            for out in execute(["python3", cmd, src, cfg]):
                print(out)

            # 7. package slides
            progress.update(8)
            print(paint.gray("Packaging slides..."))
            cmd = os.path.join(VELVEEVA_DIR, "lib", "package_slides.py")
            for out in execute(["python3", cmd,
                                os.path.join(ROOT_DIR, DEST_DIR),
                                os.path.join(ROOT_DIR, DEST_DIR, ZIPS_DIR)]):
                print(out)

            # 8. generate control files
            print(paint.gray("Generating .ctl files..."))
            progress.update(9)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "genctls.py")
            flags = ["python3", cmd,
                     "--root", ROOT_DIR,
                     "--src", os.path.abspath(os.path.join(ROOT_DIR, DEST_DIR, ZIPS_DIR)),
                     "--out", os.path.abspath(os.path.join(ROOT_DIR, DEST_DIR, CTLS_DIR)),
                     "--u", VEEVA_USERNAME,
                     "--pwd", VEEVA_PASSWORD]
            if VEEVA_EMAIL is not None:
                flags = flags + ["--email", VEEVA_EMAIL]
            for out in execute(flags):
                print(out)

            # 9. ftp
            print(paint.gray("Publishing to Veeva FTP server..."))
            progress.update(10)
            cmd = os.path.join(VELVEEVA_DIR, "lib", "publish.py")
            for out in execute(["python3", cmd,
                                "--zip", os.path.abspath(os.path.join(ROOT_DIR, DEST_DIR, ZIPS_DIR)),
                                "--ctl", os.path.abspath(os.path.join(ROOT_DIR, DEST_DIR, CTLS_DIR)),
                                "--host", VEEVA_SERVER,
                                "--u", VEEVA_USERNAME,
                                "--pwd", VEEVA_PASSWORD]):
                print(out)

            # done!
            progress.update(11)

        # TODO:
        # - relinking
        # - don't use subprocess -> import directly
        # - concurrent build
        # - better exception handling
        # - file watcher architecture
        # - all utils should use python argparse and --src SRC (e.g.) flags,
        #   not strictly positional arguments
        # - make flags required (so it fails if they are not present)
        # - unified banner printer

    except Exception as e:
        print(paint.bold.red("\nthere was an error:"))
        print(e)
        sys.exit(1)

    print(paint.bold.green("\nYum!"))
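doScript streams the output of each helper script through an execute generator whose definition is not shown in this file. A common shape for such a helper, sketched here under the assumption that it simply yields stdout lines and raises on a non-zero exit (the real implementation may differ):

import subprocess

def execute(cmd):
    """Run cmd and yield its stdout line by line (hypothetical sketch;
    the actual helper used by doScript is defined elsewhere)."""
    proc = subprocess.Popen(cmd,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            universal_newlines=True)
    for line in iter(proc.stdout.readline, ''):
        yield line.rstrip('\n')
    proc.stdout.close()
    if proc.wait() != 0:
        raise subprocess.CalledProcessError(proc.returncode, cmd)

Yielding lines instead of returning the full output is what lets the build loop print progress as each subprocess runs, which matters with redirect_stdout=True on the progress bar.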
import os

import numpy as np

from osgeo import gdal, osr

import dem  # assumed local module providing DEMGrid, used below
from s3utils import save_tiff
from progressbar import Bar, Percentage, ProgressBar, ETA

if __name__ == "__main__":
    results_dir = '/media/rmsare/GALLIUMOS/ot_results/ot-ncal/'
    working_dir = results_dir + 'masked/'

    files = os.listdir(results_dir)
    files = [f for f in files if 'tif' in f]

    pbar = ProgressBar(widgets=[Percentage(), ' ', Bar(), ' ', ETA()],
                       maxval=len(files))
    pbar.start()

    for i, f in enumerate(files):
        tile_name = f[0:10]
        data_dir = '/media/rmsare/GALLIUMOS/data/ot_data/tif/2m/'

        # mask is True wherever the 2 m DEM tile has no data
        data = dem.DEMGrid(data_dir + tile_name + '.tif')
        mask = np.isnan(data._griddata)

        inraster = gdal.Open(results_dir + f)
        transform = inraster.GetGeoTransform()
        nbands = inraster.RasterCount
        ncols = inraster.RasterXSize
        nrows = inraster.RasterYSize
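The excerpt ends before the mask is applied. As an illustration only, not the original continuation, the per-band masking and write-out could look like this with plain GDAL, assuming the results raster is float-typed, aligned cell-for-cell with the DEM tile, and that working_dir already exists (save_tiff from s3utils is not used here because its signature is not shown above):

        # Hypothetical continuation: copy the raster and blank out masked cells.
        driver = gdal.GetDriverByName('GTiff')
        outraster = driver.CreateCopy(working_dir + f, inraster)
        for b in range(1, nbands + 1):
            band = outraster.GetRasterBand(b)
            arr = band.ReadAsArray().astype(np.float32)
            arr[mask] = np.nan  # assumes mask.shape == (nrows, ncols)
            band.WriteArray(arr)
        outraster.FlushCache()
        pbar.update(i + 1)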
def search_whoosh_files(filename_in):
    ngt1 = RegexTokenizer() | NgramFilter(4)
    l_aux_i = 0
    filename_aux = "dataset_match_" + filename_in
    ix1 = open_dir("index_" + filename_in)

    # approximate upper bound for the progress bar
    max_val = 3000000

    widgets = ['Progress Searching ' + filename_in + ': ', Percentage(), ' ',
               Bar(marker='0', left='[', right=']'), ' ', ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=max_val)
    pbar.start()

    with ix1.searcher() as searcher:
        parser = MultifieldParser(['title'], ix1.schema)
        parser.remove_plugin_class(qparser.WildcardPlugin)
        parser.remove_plugin_class(qparser.PlusMinusPlugin)

        with open("dataset_non_match_" + filename_in + ".csv_tmp", 'w',
                  encoding="utf-8") as inW2:
            with open("dataset_match_" + filename_in + ".csv",
                      encoding="utf8") as csvfile:
                for row in csvfile:
                    l_aux_i = l_aux_i + 1
                    if l_aux_i % 20000 == 0:
                        print("Index search " + str(l_aux_i))
                        pbar.update(l_aux_i)

                    l_row_idx = row.split('|')[0]
                    l_row_aux = row.split('|')[1]

                    # query the index with one random 4-gram from the row text
                    search_list = [token.text for token in ngt1(l_row_aux)]
                    if len(search_list) > 0:
                        l_row_str = random.sample(search_list, 1)
                        query = parser.parse(l_row_str[0])
                        results = searcher.search(query)

                        # keep hits other than the row itself
                        results_aux = []
                        for result in results:
                            if result['id'] != l_row_idx:
                                results_aux.append([result['id'], result['title']])

                        # write up to three distinct non-matches for this row
                        if len(results_aux) > 0:
                            shuffle(results_aux)
                            line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                        results_aux[0][0] + "|" + results_aux[0][1])
                            inW2.write(line_new.strip() + '\n')
                        if len(results_aux) > 1:
                            if results_aux[1][0] != results_aux[0][0]:
                                line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                            results_aux[1][0] + "|" + results_aux[1][1])
                                inW2.write(line_new.strip() + '\n')
                        if len(results_aux) > 2:
                            if results_aux[2][0] != results_aux[1][0]:
                                line_new = (l_row_idx + "|" + l_row_aux + "|" +
                                            results_aux[2][0] + "|" + results_aux[2][1])
                                inW2.write(line_new.strip() + '\n')
    pbar.finish()
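search_whoosh_files assumes an existing Whoosh index directory named "index_<name>" with stored id and title fields, since it reads result['id'] and result['title'] from search hits. A sketch of how such an index could be built (the schema details are an assumption; only the field names are taken from the code above, and "jrc_person" is one of the dataset names the function mentions):

import os

from whoosh.fields import Schema, ID, TEXT
from whoosh.index import create_in

schema = Schema(id=ID(stored=True), title=TEXT(stored=True))
os.makedirs("index_jrc_person", exist_ok=True)
ix = create_in("index_jrc_person", schema)

writer = ix.writer()
writer.add_document(id="1", title="John Smith")  # hypothetical record
writer.commit()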
def consolidate():
    """
    Converts previous archive data model to new one.
    """
    session = Session()
    try:
        log.verbose('Checking archive size ...')
        count = session.query(ArchiveEntry).count()
        log.verbose('Found %i items to migrate, this can be aborted with CTRL-C safely.' % count)

        # consolidate old data
        from progressbar import ProgressBar, Percentage, Bar, ETA

        widgets = ['Process - ', ETA(), ' ', Percentage(), ' ',
                   Bar(left='[', right=']')]
        bar = ProgressBar(widgets=widgets, maxval=count).start()

        # id's for duplicates
        duplicates = []

        for index, orig in enumerate(session.query(ArchiveEntry).yield_per(5)):
            bar.update(index)

            # item already processed
            if orig.id in duplicates:
                continue

            # item already migrated
            if orig.sources:
                log.info('Database looks like it has already been consolidated, '
                         'item %s already has sources ...' % orig.title)
                session.rollback()
                return

            # add legacy task to the sources list
            orig.sources.append(get_source(orig.task, session))
            # remove task, deprecated .. well, let's still keep it ..
            #orig.task = None

            for dupe in session.query(ArchiveEntry).\
                    filter(ArchiveEntry.id != orig.id).\
                    filter(ArchiveEntry.title == orig.title).\
                    filter(ArchiveEntry.url == orig.url).all():
                orig.sources.append(get_source(dupe.task, session))
                duplicates.append(dupe.id)

        if duplicates:
            log.info('Consolidated %i items, removing duplicates ...' % len(duplicates))
            for id in duplicates:
                session.query(ArchiveEntry).filter(ArchiveEntry.id == id).delete()

        session.commit()
        log.info('Completed! This does NOT need to be run again.')
    except KeyboardInterrupt:
        session.rollback()
        log.critical('Aborted, no changes saved')
    finally:
        session.close()
def do(controller):
    """Calculate buried surface area contributions for ligands in PDB assemblies."""
    # get the controller command
    cmd = controller.command

    # get the command line arguments and options
    args = controller.pargs

    # predicate to remove non-polymer atoms from structure
    nonpolymers = oechem.OEOrAtom(OEAtomHasIntData(('entity_type_bm', 0)),
                                  OEAtomBinaryAndIntData(('entity_type_bm', 3)))

    assemblysets = get_assembly_sets(args)

    # directory containing all the biological assemblies in OEB format
    OEB_ASSEMBLIES_DIR = app.config.get('directories', 'quat_oeb')

    # directory where surface areas will be written
    CREDO_DATA_DIR = app.config.get('directories', 'credo_data')

    ifs = oechem.oemolistream()
    ifs.SetFormat(oechem.OEFormat_OEB)

    # initialize progressbar
    if args.progressbar:
        bar = ProgressBar(widgets=['PDB entries: ', SimpleProgress(), ' ',
                                   Percentage(), Bar()],
                          maxval=len(assemblysets)).start()

    # iterate through assembly sets
    for counter, (pdb, assemblyset) in enumerate(assemblysets, 1):
        if args.progressbar:
            bar.update(counter)

        # create a data directory for this structure to which all data will be written
        struct_data_dir = os.path.join(CREDO_DATA_DIR, pdb[1:3].lower(), pdb.lower())

        # make necessary directories recursively if they do not exist yet
        if not exists(struct_data_dir):
            os.makedirs(struct_data_dir)

        # path to the file where the atom surface areas of all atoms will be written
        surface_areas_path = os.path.join(struct_data_dir,
                                          'binding_site_atom_surface_areas.credo')

        # do not recalculate atom surface area contributions if incremental
        if args.incremental and exists(surface_areas_path) and getsize(surface_areas_path) > 0:
            continue
        elif (args.update and exists(surface_areas_path)
              and getmtime(surface_areas_path) >= time() - (args.update * 60 * 60 * 24)
              and getsize(surface_areas_path)):
            app.log.info("Output for PDB entry {0} exists and is more recent "
                         "than {1} days. Skipped.".format(pdb, args.update))
            continue

        # output file stream and CSV writer
        atomfs = open(surface_areas_path, 'w')
        atomwriter = csv.writer(atomfs, dialect='tabs')

        # deal with each found assembly separately
        # some pdb entries consist of more than one
        for assembly in assemblyset:
            if args.quat:
                path = os.path.join(OEB_ASSEMBLIES_DIR, pdb[1:3].lower(),
                                    pdb.lower(), assembly)
            else:
                app.log.error("the calculation of buried ligand surface areas "
                              "is only supported for quaternary structures.")
                sys.exit(1)

            if not os.path.isfile(path):
                app.log.warn("cannot calculate buried surface areas: "
                             "file {} does not exist!".format(path))

            # get the quaternary structure
            ifs.open(str(path))

            try:
                assembly = ifs.GetOEGraphMols().next()
            except StopIteration:
                assembly = None

            if not assembly:
                app.log.warn("cannot calculate buried surface areas: "
                             "file {} does not contain a valid molecule!".format(path))
                continue

            if not assembly.GetListData('ligands'):
                continue

            # identifier of the assembly
            assembly_serial = assembly.GetIntData('assembly_serial')

            # remove all non-polymers from assembly
            for atom in assembly.GetAtoms(nonpolymers):
                assembly.DeleteAtom(atom)

            # ignore bizarre assemblies
            if not assembly.NumAtoms():
                app.log.warn("cannot calculate buried surface areas: "
                             "file {} contains assembly with no atoms!".format(path))
                continue

            # keep only the location state with the largest average occupancy
            assembly_hi_occ = oechem.OEGraphMol()
            altlocfactory = oechem.OEAltLocationFactory(assembly)
            altlocfactory.MakeCurrentAltMol(assembly_hi_occ)

            # get the ligands
            ligands = assembly_hi_occ.GetListData('ligands')

            # iterate through all ligands of the biomolecule and calculate the buried
            # surface area atom contributions for all involved atoms
            for ligand in ligands:
                # ignore small ligands
                if oechem.OECount(ligand, oechem.OEIsHeavy()) < 7:
                    continue

                entity_serial = ligand.GetIntData('entity_serial')

                # keep only the location state with the largest average occupancy
                altlig = oechem.OEGraphMol()
                altlocfactory = oechem.OEAltLocationFactory(ligand)
                altlocfactory.MakeCurrentAltMol(altlig)

                cmplx_srf = oespicoli.OESurface()
                ligand_srf = oespicoli.OESurface()

                # make solvent-accessible surface of ligand
                oespicoli.OEMakeAccessibleSurface(ligand_srf, altlig, 0.5, 1.4)

                # get the atom contributions of the ligand surface
                ligand_atom_areas = get_atom_surface_areas(altlig, ligand_srf)

                # extract the binding site of the assembly to speed up surface
                # area calculation
                binding_site = get_binding_site(assembly_hi_occ, altlig)

                # make solvent-accessible surface of binding site
                binding_site_srf = oespicoli.OESurface()
                oespicoli.OEMakeAccessibleSurface(binding_site_srf, binding_site, 0.5, 1.4)

                # get the atom contributions of the binding site surface
                binding_site_atom_areas = get_atom_surface_areas(binding_site,
                                                                 binding_site_srf)

                # create complex
                cmplx = oechem.OEGraphMol()
                oechem.OEAddMols(cmplx, binding_site)
                oechem.OEAddMols(cmplx, altlig)

                # make solvent-accessible surface of the complex
                oespicoli.OEMakeAccessibleSurface(cmplx_srf, cmplx, 0.5, 1.4)

                # surface area atom contributions of the whole complex
                cmplx_atom_areas = get_atom_surface_areas(cmplx, cmplx_srf)

                ## extract the atom surface areas in the bound state through slices
                binding_site_atom_areas_bound = cmplx_atom_areas[:binding_site.NumAtoms()]
                ligand_atom_areas_bound = cmplx_atom_areas[binding_site.NumAtoms():]

                # difference between apo and bound state per polymer atom
                binding_site_delta = binding_site_atom_areas - binding_site_atom_areas_bound
                ligand_delta = ligand_atom_areas - ligand_atom_areas_bound

                # boolean map indicating for which atom the surface area has changed
                binding_site_atom_map = binding_site_delta != 0
                ligand_atom_map = ligand_delta != 0

                if args.dry_run:
                    continue

                # only record the atoms where the solvent-accessible surface
                # area has actually changed
                write_atoms(atomwriter, binding_site, binding_site_atom_map, pdb,
                            assembly_serial, entity_serial, binding_site_atom_areas,
                            binding_site_atom_areas_bound)
                write_atoms(atomwriter, altlig, ligand_atom_map, pdb,
                            assembly_serial, entity_serial, ligand_atom_areas,
                            ligand_atom_areas_bound)

            app.log.debug("wrote buried surface areas for all ligands in "
                          "biomolecule {} to {}.".format(pdb, surface_areas_path))

        atomfs.flush()
        atomfs.close()

    if args.progressbar:
        bar.finish()
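The slicing above works because OEAddMols appends the ligand's atoms after the binding-site atoms, so the per-atom areas of the complex can be split at binding_site.NumAtoms(). A toy numpy trace of the apo-versus-bound bookkeeping, with invented numbers rather than OpenEye output:

import numpy as np

# per-atom solvent-accessible areas computed on each molecule alone
binding_site_atom_areas = np.array([12.0, 0.0, 7.5])  # 3 site atoms (toy values)
ligand_atom_areas = np.array([20.0, 5.0])             # 2 ligand atoms (toy values)

# per-atom areas recomputed on the combined complex, in the same atom order
cmplx_atom_areas = np.array([10.0, 0.0, 7.5, 11.0, 5.0])

n_site = 3  # binding_site.NumAtoms()
site_bound = cmplx_atom_areas[:n_site]
lig_bound = cmplx_atom_areas[n_site:]

# atoms whose accessible area shrank on binding are the ones worth recording
site_changed = (binding_site_atom_areas - site_bound) != 0  # [True, False, False]
lig_changed = (ligand_atom_areas - lig_bound) != 0          # [True, False]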
import os
import sys

from argparse import ArgumentParser
from collections import defaultdict

from progressbar import ProgressBar, Bar

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('abstract')
    parser.add_argument('-o', '--output', default='data-08-2018')
    args = parser.parse_args()

    abstracts = defaultdict(str)
    titles = defaultdict(str)

    with open(args.abstract) as absfile:
        for line in absfile:
            id, doi, title, abstract = line.split('\t', 3)
            titles[id] = title
            # store the abstract text itself (trailing newline stripped)
            abstracts[id] = abstract.rstrip('\n')

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    pbar = ProgressBar(widgets=[Bar()])
    for key in pbar(abstracts.keys()):
        filename = os.path.join(args.output, key)
        with open(filename, 'w') as outfile:
            outfile.write(titles[key] + '\n')
            outfile.write(abstracts[key] + '\n')
            outfile.write(' \n')
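For reference, a hypothetical tab-separated input line of the form

1234	10.1000/xyz123	An Example Title	Text of the abstract...

produces a file named 1234 under the output directory whose first line is the title and whose second line is the abstract text.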
import io

from collections import defaultdict
from collections import OrderedDict

import pandas as pd
import matplotlib.pyplot as plt

from progressbar import AnimatedMarker, Bar, BouncingBar, Counter, ETA, \
    FileTransferSpeed, FormatLabel, Percentage, \
    ProgressBar, ReverseBar, RotatingMarker, \
    SimpleProgress, Timer

# progress bar settings
widgets = ['Progress: ', Percentage(), ' ',
           Bar(marker=RotatingMarker()), ' ', ETA()]


def getTaggedMovies():
    """Return the set of movie ids that have at least one tag."""
    tagged_movies = set()
    with io.open("datasets/ml-latest-small/folksonomy.csv", "r",
                 encoding="ISO-8859-1") as file:
        for line in file:
            tokens = line.strip().split("\t")
            movieid = int(tokens[0], 10)
            tagged_movies.add(movieid)
    return tagged_movies
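The widgets list above is defined for later use; a sketch of how it would plug into a ProgressBar, for example while iterating over the tagged movies (the per-movie work is a placeholder):

tagged = getTaggedMovies()
pbar = ProgressBar(widgets=widgets, maxval=len(tagged)).start()
for i, movieid in enumerate(sorted(tagged)):
    # ... per-movie processing would go here ...
    pbar.update(i + 1)
pbar.finish()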
import io
import os
import re

from progressbar import Bar, ETA, Percentage, ProgressBar
from tika import parser


def main(pdfdir, textdir):
    dirlist = [fn for fn in os.listdir(pdfdir) if fn.endswith('.pdf')]

    print('Extracting text, using Tika, from %d files in %s.' %
          (len(dirlist), pdfdir))
    print(' Writing output text files to %s.' % textdir)

    if not os.path.exists(textdir):
        os.mkdir(textdir)

    widgets = ['Files (of %d): ' % len(dirlist), Percentage(), ' ',
               Bar('='), ' ', ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(dirlist)).start()

    for (i, fn) in enumerate(dirlist):
        pbar.update(i)

        parsed = parser.from_file(pdfdir + '/' + fn)
        try:
            if parsed['content'] is None:
                print('Tika found no content in %s.' % fn)
                continue
        except Exception:
            print('Tika could not parse %s.' % fn)
            continue

        with io.open(textdir + '/' + fn[0:-4] + '.txt', 'w',
                     encoding='utf8') as outf:
            cleaned = parsed['content']

            # Translate some UTF-8 punctuation to ASCII
            punc = {
                0x2018: 0x27, 0x2019: 0x27,  # single quotes
                0x201C: 0x22, 0x201D: 0x22,  # double quotes
                0x2010: 0x2d, 0x2011: 0x2d, 0x2012: 0x2d, 0x2013: 0x2d,  # hyphens
                0xF0B0: 0xb0,  # degree
                0xFF0C: 0x2c,  # comma
                0x00A0: 0x20,  # space
                0x2219: 0x2e, 0x2022: 0x2e,  # bullets
            }
            # 0x005E:0x5e, 0x02C6:0x5e, 0x0302:0x5e, 0x2038:0x5e,  # carets
            # 0x00B0:0x6f, 0x02DA:0x6f,  # degree
            # 0x00B9:0x31, 0x00B2:0x32, 0x00B3:0x33,  # exponents
            cleaned = cleaned.translate(punc)

            # Replace newlines that separate words with a space (unless hyphen)
            cleaned = re.sub(r'([^\s-])[\r|\n]+([^\s])', '\\1 \\2', cleaned)

            # Remove hyphenation at the end of lines
            # (this is sometimes bad, as with "Fe-\nrich")
            cleaned = cleaned.replace('-\n', '\n')

            # Remove all newlines
            cleaned = re.sub(r'[\r|\n]+', '', cleaned)

            # Remove xxxx.PDF
            cleaned = re.sub(r'([0-9][0-9][0-9][0-9].PDF)', '', cleaned,
                             flags=re.IGNORECASE)

            # And "xx(th|st) Lunar and Planetary Science Conference ((19|20)xx)"
            # with optional parentheses, optional LPI contrib
            cleaned = re.sub(
                r'([0-9][0-9].. Lunar and Planetary Science Conference \(?(19|20)[0-9][0-9]\)?)'
                r'( \(LPI Contrib. No. [0-9][0-9][0-9][0-9]\))? ?',
                '', cleaned, flags=re.IGNORECASE)

            # And "Lunar and Planetary Science XXXIII (2002)"
            # with Roman numeral and optional year
            cleaned = re.sub(
                r'(Lunar and Planetary Science [CDILVXM]+( \((19|20)[0-9][0-9]\))?) ?',
                '', cleaned, flags=re.IGNORECASE)

            # Remove mailto: links
            cleaned = re.sub(r'mailto:[^\s]+', '', cleaned)

            outf.write(cleaned)

    pbar.finish()
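The three newline rules interact in a fixed order: join word-separating newlines, then drop end-of-line hyphenation, then strip what remains. A toy trace, for illustration only:

sample = 'mineral compo-\nsition of the\nsample'
step1 = re.sub(r'([^\s-])[\r|\n]+([^\s])', '\\1 \\2', sample)
# -> 'mineral compo-\nsition of the sample' (the hyphen blocks the first join)
step2 = step1.replace('-\n', '\n')
# -> 'mineral compo\nsition of the sample' (hyphenation removed)
step3 = re.sub(r'[\r|\n]+', '', step2)
print(step3)  # -> 'mineral composition of the sample'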