def execute_command(self):
    args = self.parser.parse_args()
    d = DataProvider()
    data = d.extract_data()
    x = CalculationLogic()
    if args.task == 't1':
        if args.district and args.year:
            if args.gender:
                print(x.task1(data, args.year, args.district, args.gender))
            else:
                print(x.task1(data, args.year, args.district))
    elif args.task == 't2':
        if args.district:
            if args.gender:
                print(x.task2(data, args.district, args.gender))
            else:
                print(x.task2(data, args.district))
    elif args.task == 't3':
        if args.year:
            if args.gender:
                print(x.task3(data, args.year, args.gender))
            else:
                print(x.task3(data, args.year))
    elif args.task == 't4':
        if args.gender:
            print(x.task4(data, args.gender))
        else:
            print(x.task4(data))
    elif args.task == 't5':
        if args.district and args.district2:
            if args.gender:
                print(x.task5(data, args.district, args.district2, args.gender))
            else:
                print(x.task5(data, args.district, args.district2))
def train(args):
    if not os.path.isdir('./%s' % args.output):
        os.system('mkdir ./%s' % args.output)
    if args.cont:
        try:
            model = torch.load(args.cont)
            print('load success')
        except:
            model = OwnModel()
    else:
        model = OwnModel()
    dp = DataProvider(args.dataset)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=0.9, weight_decay=1e-5)
    crit = torch.nn.CrossEntropyLoss()
    mbsize = 1024
    for epoch in range(args.max_iter):
        odata, olabel = dp.train_iter(mbsize)
        data = Variable(torch.from_numpy(odata))
        label = Variable(torch.from_numpy(olabel))
        lr = get_lr(model.iter)
        pred = model(data)
        pred = pred.contiguous().view(-1, 2)
        loss = crit(pred, label)
        optimizer.zero_grad()
        for group in optimizer.param_groups:
            group['lr'] = lr
        loss.backward()
        optimizer.step()
        print('iter:%s loss:%s' % (epoch, loss.data.numpy()), end='\r')
        if epoch % 10 == 0:
            torch.save(model, './%s/model.pkl' % args.output)
def validate(s2s, n_samples):
    bsize = 1  # batch size
    vgen = DataProvider(n_samples, mlen, batch_size=bsize)
    correct = 0
    correct_elements = 0
    total = 0
    total_elements = 0
    for _ in range(n_samples):
        batch, slen, _d_inputs, _d_seqlen, _targets_e, _targets_d = vgen.next()
        inp = []
        for i, b1 in enumerate(batch):
            for j in range(slen[i]):
                inp.append(chr(b1[j].index(1)))
        e_results, results = s2s.do_inference(batch, slen, vgen)
        pred = [chr(np.argmax(e_results))]
        for i, result in enumerate(results):
            for j, res in enumerate(result):  # for each seq in a mini batch
                pred.append(chr(np.argmax(res)))
        pred = pred[:-1]  # ignore the end char
        print "Inp: ", inp
        print "Prd: ", pred
        for c1, c2 in zip(inp, pred):
            if c1 == c2:
                correct_elements += 1
            total_elements += 1
        if inp == pred:
            correct += 1
        total += 1
    print "EXACT match validation accuracy: ", (float(correct) / total) * 100, "%"
    print "Elementwise match validation accuracy: ", (float(correct_elements) / total_elements) * 100, "%"
def train(config, network_spec=None):
    data_provider = DataProvider(config.db)
    env = StockEnvironment(data_provider, config, 0)
    agent = overwrite_agent(env, network_spec, config) if config.overwrite_agent \
        else load_agent(config, env, network_spec)
    mlflow.log_param("agent", "tensorforce.agents.DQNAgent")
    for key in config.agent_specs:
        mlflow.log_param(key, config.agent_specs[key])
    runner = Runner(agent=agent, environment=env)
    offset = 20000
    num_episodes = 20
    step = 0
    while data_provider.has_data_key(offset + config.max_step_per_episode):
        runner.run(num_episodes=num_episodes)
        offset = offset + config.max_step_per_episode
        env.offset = offset
        agent.save(config.agent_dir, config.agent_name)
        if step % 10 == 0:
            evaluate(config, data_provider, offset - config.max_step_per_episode, agent)
        step += 1
    return agent, env
def __init__(self, uriList, streamType, moreVars, expandPatterns=True):
    DataProvider.__init__(self, streamType, moreVars)
    self.expandPatterns = expandPatterns
    if "expandPatterns" in moreVars:
        if self.expandPatterns == "False" or self.expandPatterns == "false":
            self.expandPatterns = False
        del moreVars["expandPatterns"]
    if self.expandPatterns:
        fileNamePatterns = []
        for f in uriList:
            if f.startswith("file://"):
                fileNamePatterns.append(f[7:])
            else:
                fileNamePatterns.append(f)
    else:
        fileNamePatterns = uriList
    if len(fileNamePatterns) == 0:
        raise NoFilesSpecified()
    self.myFileNames = [name for name in expandFiles(fileNamePatterns, shouldOpen=False,
                                                     checkPattern=self.expandPatterns)]
    if not self._streamType:
        colonPos = self.myFileNames[0].rfind(":")
        # No colon, or a colon at position 1 followed by a backslash (a Windows
        # drive letter such as "c:\path"): the whole string is the file name.
        if colonPos == -1 or (colonPos == 1 and len(self.myFileNames[0]) >= 3
                              and self.myFileNames[0][2] == '\\'):
            name = self.myFileNames[0]
        else:
            name = self.myFileNames[0][0:colonPos]
        ext = os.path.splitext(name)[1]
        if ext == ".gz":
            # try to get prev extension before gzip
            ext = os.path.splitext(os.path.splitext(name)[0])[1]
        self._streamType = io_targets.getTypeByExtension(ext)
        if not self._streamType:
            raise UnknownExtensionType(name)
def __init__(self, rss_url, max_news=2):
    """
    .ctor
    :return:
    """
    DataProvider.__init__(self, max_news=max_news)
    self.rss_url = rss_url
    self.max_new_article = max_news
def main():
    # read data
    dp = DataProvider()
    dp.read_data("train.csv")
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    create_model(dp)
def preparation():
    """Prepare databases and start background tasks."""
    # Kill old processes if running
    kill_bg_servers()
    time.sleep(10)
    data_dir = config.DATA_DIR
    log.info("Removing Databases.")
    with contextlib.suppress(FileNotFoundError):
        # Remove Bloom Filter
        os.remove(data_dir + config.BLOOM_FILE)
        # Remove Databases
        os.remove(data_dir + config.KEYSERVER_DB)
        os.remove(data_dir + config.STORAGE_DB)
    # Add User
    log.info("Prepare User DB.")
    db.main(UserType.CLIENT, ['testuser', 'password', '-a'], no_print=True)
    db.main(UserType.OWNER, ['testprovider', 'password', '-a'], no_print=True)
    log.info("Starting Background Servers.")
    subprocess.run([f"{config.WORKING_DIR}src/allStart.sh", "eval"])
    time.sleep(10)
    # Create data provider client
    d = DataProvider('testprovider')
    d.set_password('password')
    # Check that servers are really online
    tries = 0
    done = False
    while not done:
        try:
            if tries >= 1:
                # Try to start servers again.
                kill_bg_servers()
                time.sleep(10)
                subprocess.run([f"{config.WORKING_DIR}src/allStart.sh", "eval"])
                time.sleep(10)
                tries = 0
            # Check Key Server
            d.get_token(ServerType.KeyServer)
            # Check celery
            r = d.get(d.KEYSERVER.replace('provider', 'celery'))
            if r.content != b"True":
                raise RuntimeError("Celery of keyserver not started.")
            # Check Storage Server
            d.get_token(ServerType.StorageServer)
            # Check celery
            r = d.get(d.STORAGESERVER.replace('provider', 'celery'))
            if r.content != b"True":
                raise RuntimeError("Celery of storage-server not started.")
            # Success
            done = True
        except Exception as e:
            log.error(f"Server not up, yet. Try: {tries}. Error: {str(e)}")
            tries += 1
            time.sleep(5)
def rnn():
    data_provider = DataProvider(data_dir, BATCH_SIZE, SEQUENCE_LENGTH)
    model = RNNModel(data_provider.vocabulary_size,
                     batch_size=BATCH_SIZE,
                     sequence_length=SEQUENCE_LENGTH,
                     hidden_layer_size=HIDDEN_LAYER_SIZE,
                     cells_size=CELLS_SIZE)
    with tf.Session() as sess:
        summaries = tf.summary.merge_all()
        writer = tf.summary.FileWriter(tensorboard_dir)
        writer.add_graph(sess.graph)
        sess.run(tf.global_variables_initializer())
        # Forward pass and one backward pass of all the training examples
        epoch = 0
        temp_losses = []
        smooth_losses = []
        while True:
            sess.run(tf.assign(model.learning_rate,
                               LEARNING_RATE * (DECAY_RATE ** epoch)))
            data_provider.reset_batch_pointer()
            state = sess.run(model.initial_state)
            for batch in range(data_provider.batches_size):
                inputs, targets = data_provider.next_batch()
                feed = {model.input_data: inputs, model.targets: targets}
                for index, (c, h) in enumerate(model.initial_state):
                    feed[c] = state[index].c
                    feed[h] = state[index].h
                # Iteration is the number of times batch data has passed
                # through the neural network - both forward and backwards
                # propagation
                iteration = epoch * data_provider.batches_size + batch
                summary, loss, state, _ = sess.run(
                    [summaries, model.cost, model.final_state, model.train_op], feed)
                writer.add_summary(summary, iteration)
                temp_losses.append(loss)
                if iteration % SAMPLING_FREQUENCY == 0:
                    sample_model(sess, data_provider, iteration)
                if iteration % LOGGING_FREQUENCY == 0:
                    smooth_loss = np.mean(temp_losses)
                    smooth_losses.append(smooth_loss)
                    temp_losses = []
                    plot(smooth_losses, "iterations (thousands)", "loss")
                    print('{{"metric": "iteration", "value": {}}}'.format(iteration))
                    print('{{"metric": "epoch", "value": {}}}'.format(epoch))
                    print('{{"metric": "loss", "value": {}}}'.format(smooth_loss))
            epoch += 1
def main():
    # read data
    dp = DataProvider()
    xgb_model = XGBClassifier()
    dp.read_data("train.csv")
    if not os.path.exists(MODEL_DIR):
        os.makedirs(MODEL_DIR)
    # create_xgbmodel(dp, xgb_model, device="gpu")
    opt = Optimizer()
    tune_with_TPE(dp, xgb_model, opt)
def show_patches():
    from data_provider import DataProvider
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(10)
    while True:
        batch = dp.get_batch()
        for img in batch[0]:
            # img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = img / img.max()
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
def test_epoch_complete(self):
    # check if every element of the dataset is really seen at the end
    provider = DataProvider('final_data', 8)
    dataset_img = [img.tostring() for img in provider.images]
    while provider.next_batch_available():
        batch_img, _ = provider.get_batch()
        for img in batch_img:
            if img.tostring() in dataset_img:
                dataset_img.remove(img.tostring())
    self.assertEqual(len(dataset_img), 0)
def main(parameters):
    data_provider = DataProvider()
    train_loader, test_loader = data_provider.get_data_loaders(**parameters)
    writer = SummaryWriter()
    trainer_type = parameters["experiment"] + "net"
    trainer = TrainerFactory.create_trainer(trainer_type, train_loader,
                                            test_loader, writer, **parameters)
    trainer.run(parameters["epochs"])
    writer.close()
def test_get_casted_dataframe(self, mock_method):
    date_column = pd.date_range(start=datetime.datetime.today(), periods=4)
    mock_method.side_effect = [
        pd.DataFrame([' min ', 'asdasdasd0', ' ciao', 'ciao '], dtype='object'),
        pd.DataFrame(['UD', ' O', 'P ', ' TS '], dtype='object'),
        pd.DataFrame([0, 1, 1, 1]),
        pd.DataFrame((([np.nan] * 3) + [0.24]), dtype='float64'),
        pd.DataFrame(date_column)
    ]
    # Build a test dataframe, identical to self.dp.df but with string values only
    data = {
        'col1': [' min ', 'asdasdasd0', ' ciao', 'ciao '],
        'col2': ['UD', ' O', 'P ', ' TS '],
        'col3': ['0', '1', '1', '1'],
        'col4': ([np.nan] * 3) + ['0.24'],
        'col5': date_column.strftime("%Y-%m-%d")  # Cast dates to strings
    }
    df = pd.DataFrame(data)
    # Build a test DataProvider from the strings-only dataframe
    test_dp = DataProvider(df=df,
                           column_types={0: 'object', 1: 'object', 2: 'int',
                                         3: 'float', 4: 'date'},
                           column_constraints={0: False, 1: False, 2: True,
                                               3: False, 4: False})
    # Perform the casting
    casted_df = test_dp.get_casted_dataframe()
    self.assertEqual(casted_df.dtypes.tolist(), [
        np.dtype('O'), np.dtype('O'), np.dtype('int64'),
        np.dtype('float64'), np.dtype('<M8[ns]')
    ])
def __init__(self, part='2', img_width=28, filter_width=28, num_filters=2,
             num_classes=2, alpha=.01, activation_function='sigmoid',
             relu_alpha=0, sig_lambdas=(1, 1, 1), subset_size=1, tanh_lambda=1):
    self.part = part
    if self.part == '2':
        self.filter_width = 28
        self.num_filters = 2
        num_classes = 2
        train_dir = '../data/part2/train/*'
        test_dir = '../data/part2/train/*'
    if self.part == '3a' or part == '3b':
        self.filter_width = 7
        self.num_filters = 16
        num_classes = 10
        train_dir = '../data/part3/train/*'
        test_dir = '../data/part3/train/*'
    self.img_width = img_width
    self.output_dim = num_classes
    self.alpha = alpha
    self.activation_function = activation_function
    self.relu_alpha = relu_alpha
    self.sig_lambdas = sig_lambdas
    self.tanh_lambda = tanh_lambda
    # computed properties
    self.conv_mat_H = np.power((img_width - self.filter_width + 1), 2)  # number of kernel positions
    self.conv_mat_S = img_width - self.filter_width  # space between kernel and outside of image
    self.conv_output_dim = self.conv_mat_H * self.num_filters
    # create data provider to feed in data
    self.dp = DataProvider(train_dir, test_dir, num_classes, subset_size)
    if part == '2' or part == '3a':
        self.init_weights_3A()
    else:
        self.init_weights_3B()
def __init__(self, cfg):
    DataProvider.__init__(self, cfg)
    # Load training images (path) and labels
    train_path = os.path.join(Paths.data_path, 'cell/labels/train.csv')
    test_path = os.path.join(Paths.data_path, 'cell/labels/test.csv')
    data_type = {'image_name': np.str, 'label': np.int}
    self._train_df = pd.read_csv(train_path, dtype=data_type)
    self._test_df = pd.read_csv(test_path, dtype=data_type)
    self._train_list = list(self._train_df.index)
    # random.shuffle(self._train_list)
    self._test_list = list(self._test_df.index)
    self._test_size = len(self._test_list)
    self._train_index = 0
    self._test_index = 0
def test_get_column_constraints_is_respected_NotImplemented(self):
    # Create a DataProvider instance
    data = {
        # duplicate the last value of the first column
        'col1': ['222365896', '522559845', '333652214', '522559845'],
        'col2': ['UD', ' O', 'P ', ' TS ']
    }
    df = pd.DataFrame(data)
    col_types = {0: 'object', 1: 'object'}
    dp = DataProvider(df, col_types, column_constraints=NotImplemented)
    # Check the resulting values
    duplicated_values = dp.get_column_constraints_is_respected()
    pd.testing.assert_series_equal(duplicated_values,
                                   pd.Series([], dtype='object'))
def main():
    dp = DataProvider()
    test_data = dp.get_test_data()
    model_name = "rando:0%reg_a:0%max_d:0%subsa:1%boost:gbtree%nthre:8%colsa:1%learn:0.025%scale:5.2872645858027125%max_d:3%missi:None%gamma:0%base_:0.5%colsa:1%min_c:2%seed:100%n_job:1%silen:0%n_est:800%reg_l:1%objec:binary:logistic%"
    path = "/home/msaffarm/KaggleChallenges/SafeDriverPred/xgbModel/trainedModels/" + model_name
    model = get_model(path)
    test_ids = test_data[["id"]].as_matrix()
    test_data.drop(["id"], axis=1, inplace=True)
    preds = model.get_booster().predict(xgb.DMatrix(test_data))
    final_pred = np.concatenate([test_ids.reshape(-1, 1), preds.reshape(-1, 1)], axis=1)
    final_pred_df = pd.DataFrame(final_pred, columns=["id", "target"])
    final_pred_df["id"] = final_pred_df["id"].astype(int)
    print(final_pred_df)
    final_pred_df.to_csv("predictions.csv", index=False)
def main():
    # parse config
    config_file = sys.argv[1]
    config = Config(config_file)
    # setup logger
    setup_logging(config.working_dir)
    # encoding func
    encoding_func = ENCODING_METHOD_MAP[config.encoding_method]
    encoding_func2 = ENCODING_METHOD_MAP[config.encoding_method2]
    log_to_file('Encoding method2', config.encoding_method2)
    data_provider = []
    for p in range(config.base_model_count):
        temp_provider = DataProvider(
            encoding_func,
            encoding_func2,
            config.data_file,
            config.test_file,
            config.batch_size,
            max_len_hla=config.max_len_hla,
            max_len_pep=config.max_len_pep,
            model_count=config.model_count
        )
        data_provider.append(temp_provider)
    log_to_file('max_len_hla', data_provider[0].max_len_hla)
    log_to_file('max_len_pep', data_provider[0].max_len_pep)
    test(config, data_provider[0])
def setup(trello_key, trello_secret, board_id, out, delimiter, card_extractors, filters):
    # validate inputs
    if not trello_key or not trello_secret:
        raise click.BadParameter('trello_secret and trello_key are required')
    if not board_id:
        raise click.BadParameter('board_id is required')
    trello_client = TrelloClient(
        api_key=trello_key,
        api_secret=trello_secret,
    )
    data_provider = DataProvider(Board(
        trello_client,
        board_id=board_id,
    ))
    print(data_provider.board.name)  # TODO: add logging
    database = DataBase(delimiter=delimiter)
    runner = Runner(data_provider, database,
                    card_extractors_parameter=[Parameter(x.strip())
                                               for x in card_extractors.split(',')],
                    filters=[Parameter(x.strip()) for x in filters.split(',')]
                    if filters else [])
    runner.run()
    database.export(out)
def show_patches():
    from data_provider import DataProvider
    # dp = DataProvider(False, ['s0'])
    dp = DataProvider(True, ['s0'])
    dp.set_batch_size(10)
    while True:
        batch = dp.get_batch()
        imgs = batch[0]
        illums = batch[2]
        for i in range(len(imgs)):
            # img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = imgs[i] / imgs[i].max()
            illum = illums[i]
            print('illum: ', illum)
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
def main():
    # parse config
    config_file = sys.argv[1]
    config = Config(config_file)
    # setup logger
    setup_logging(config.working_dir)
    # encoding func
    encoding_func = ENCODING_METHOD_MAP[config.encoding_method]
    encoding_func2 = ENCODING_METHOD_MAP[config.encoding_method2]
    log_to_file('Encoding method2', config.encoding_method2)
    data_provider = []
    for p in range(config.base_model_count):
        temp_provider = DataProvider(encoding_func,
                                     encoding_func2,
                                     config.data_file,
                                     config.test_file,
                                     config.batch_size,
                                     max_len_hla=config.max_len_hla,
                                     max_len_pep=config.max_len_pep,
                                     model_count=config.model_count)
        data_provider.append(temp_provider)
    log_to_file('Training samples', len(data_provider[0].train_samples[0]))
    log_to_file('Val samples', len(data_provider[0].validation_samples[0]))
    log_to_file('Training steps', data_provider[0].train_steps())
    log_to_file('Val steps', data_provider[0].val_steps())
    log_to_file('Batch size', data_provider[0].batch_size)
    log_to_file('max_len_hla', data_provider[0].max_len_hla)
    log_to_file('max_len_pep', data_provider[0].max_len_pep)
    for p in range(config.base_model_count):
        train(config, data_provider[p], p)
def show_images() -> None:
    random.seed(10)
    dp = DataProvider.load_from_folder(dataset_folder)
    nn = NeuralNet(sizes=[784, 128, 10], epochs=10)
    nn.train(dp.get_train_x(), dp.get_hot_encoded_train_y(),
             dp.get_test_x(), dp.get_hot_encoded_test_y())
    properly_classified, misclassified = nn.get_properly_classified_and_misclassified_images(
        dp.get_test_x(), dp.get_hot_encoded_test_y())
    # Show the first five examples from each group
    print('properly classified')
    for image in properly_classified[:5]:
        plt.imshow(image.reshape(28, 28), cmap=cm.binary)
        plt.show()
    print('misclassified')
    for image in misclassified[:5]:
        plt.imshow(image.reshape(28, 28), cmap=cm.binary)
        plt.show()
def test_invalid_formatting(self):
    invalid_values = DataProvider.get_data("generic.invalid_values.json")
    for value in invalid_values["text-fields"]:
        AddAssetPage.complete_form(self.driver, "asset_form.valid_asset.0.json")
        AddAssetPage.set_text_fields(self.driver, value)
        # Submission should not be successful
        assert "/add" in self.driver.current_url
        # Validation alert should be visible
        assert AddAssetPage.is_validation_message_displayed(self.driver)
    for value in invalid_values["non-http-urls"]:
        AddAssetPage.complete_form(self.driver, "asset_form.valid_asset.0.json")
        AddAssetPage.set_url_fields(self.driver, value)
        # Submission should not be successful
        assert "/add" in self.driver.current_url
        # Validation alert should be visible
        assert AddAssetPage.is_validation_message_displayed(self.driver)
    for value in invalid_values["lists"]:
        AddAssetPage.complete_form(self.driver, "asset_form.valid_asset.0.json")
        AddAssetPage.set_list_fields(self.driver, value)
        # Submission should not be successful
        assert "/add" in self.driver.current_url
        # Validation alert should be visible
        assert AddAssetPage.is_validation_message_displayed(self.driver)
def test_get_column_constraints_is_respected_strings(self):
    # Create a DataProvider instance
    data = {
        # duplicate the last value of the first column
        'col1': ['222365896', '522559845', '333652214', '522559845'],
        'col2': ['UD', ' O', 'P ', ' TS ']
    }
    df = pd.DataFrame(data)
    col_types = {0: 'object', 1: 'object'}
    col_constraints = {0: True, 1: False}
    dp = DataProvider(df, col_types, col_constraints)
    # Check the resulting values
    duplicated_values = dp.get_column_constraints_is_respected()
    pd.testing.assert_series_equal(duplicated_values,
                                   pd.Series([False, False, False, True]))
    self.assertEqual(duplicated_values.sum(), 1)
def test_get_column_constraints_is_respected_multicolumn(self):
    # Create a DataProvider instance
    data = {
        'col1': ['222365896', '522559845', '522559845', '522559845'],
        'col2': ['UD', 'GO', 'PN', 'GO'],
        'col3': [1, 2, 3, 4]
    }
    df = pd.DataFrame(data)
    col_types = {0: 'object', 1: 'object', 2: 'int'}
    col_constraints = {0: True, 1: True, 2: False}
    dp = DataProvider(df, col_types, col_constraints)
    # Check the resulting values
    duplicated_values = dp.get_column_constraints_is_respected()
    pd.testing.assert_series_equal(duplicated_values,
                                   pd.Series([False, False, False, True]))
    self.assertEqual(duplicated_values.sum(), 1)
def test_get_casted_column_for_type_date(self):
    s = pd.Series(['05/11/2020', '05/12/2020', '05/13/2020', '05/14/2020'],
                  dtype='object')
    casted_s = DataProvider.get_casted_column_for_type(s, 'date')
    date_series = pd.date_range(start='05/11/2020', periods=4)
    pd.testing.assert_frame_equal(casted_s, pd.DataFrame(date_series))
    self.assertEqual(casted_s.dtypes.tolist(), [np.dtype('<M8[ns]')])
def main():
    x = tf.placeholder(tf.float32, [batch_size, 512, 512, 3])
    y = tf.placeholder(tf.float32, [None, 3])
    out = M.test_architecture2(x)
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(batch_size)
    angular_loss = angular_error_fn(out, y)
    nr_step = 100
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, "tf_log/model.ckpt")
        for epoch in range(0, nr_epochs):
            for step in range(0, nr_step):
                batch = dp.get_batch()
                feed_x = batch[0]
                feed_y = batch[2]
                ans, angular_error = sess.run([out, angular_loss],
                                              feed_dict={x: feed_x, y: feed_y})
                print(str(step) + " Angular_error: " + str(angular_error))
                print(ans[0])
                print(feed_y[0])
                img = feed_x[0] / feed_x[0].max()
                # cv2.imshow("Input", np.power(img, 1 / 2.2))
                # cv2.waitKey(0)
                cv2.imwrite("data/inference/" + str(step) + "_img_input.png",
                            255 * np.power(img, 1 / 2.2))
                img_gt = sp.apply_gt(img, feed_y[0])
                cv2.imwrite("data/inference/" + str(step) + "_img_gt.png",
                            255 * np.power(img_gt, 1 / 2.2))
                img_pred = sp.apply_gt(img, ans[0])
                cv2.imwrite("data/inference/" + str(step) + "_img_pred.png",
                            255 * np.power(img_pred, 1 / 2.2))
    dp.stop()
def example4():
    """
    Neural net with 2 hidden layers
    100 epochs test
    """
    dp = DataProvider.load_from_folder(dataset_folder)
    nn = NeuralNet2(sizes=[784, 128, 64, 10], epochs=100)
    nn.train(dp.get_train_x(), dp.get_hot_encoded_train_y(),
             dp.get_test_x(), dp.get_hot_encoded_test_y())
def run_validation(cls, model):
    _, gen_val = DataProvider.get_generators()
    print('Evaluating model...')
    result = model.evaluate_generator(gen_val, use_multiprocessing=True, workers=4)
    print('Results:')
    for idx, metric in enumerate(model.metrics_names):
        print(f'\t{metric}: {result[idx]}')
def show_patches():
    from data_provider import DataProvider
    dp = DataProvider(True, ['g0'])
    dp.set_batch_size(1)
    while True:
        batch = dp.get_batch()
        images = batch[0]
        labels = batch[2]
        for i in range(len(images)):
            img = images[i]
            gt = labels[i]
            # img = img / np.mean(img, axis=(0, 1))[None, None, :]
            img = img / img.max()
            cv2.imshow("Input", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
            img = apply_gt(img, gt)
            cv2.imshow("Corrected", np.power(img, 1 / 2.2))
            cv2.waitKey(0)
def test_add_nlp_tags(self):
    payload = DataProvider.get_payload("POST_asset_add.tag_payload.0.json")
    # Request is made twice to check for tag duplication
    response_one = requests.post(ADD_ENDPOINT, data=payload)
    response_two = requests.post(ADD_ENDPOINT, data=payload)
    # Getting POS annotations
    nlp_response = requests.post(NLP_ENDPOINT, data=payload["asset_purpose"])
    tags_to_store = [
        "NN", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
        "JJ", "JJR", "JJS", "RB", "RBR", "RBS"
    ]
    expected_tags = []
    for sentence in nlp_response.json()["sentences"]:
        for token in sentence["tokens"]:
            if token["pos"] in tags_to_store:
                expected_tags.append(token["lemma"])
    # Checking if tags were added
    found_count = 0
    for doc in self.db.tags.find({'type': 'NLP'}):
        if doc["value"] in expected_tags:
            found_count += 1
    # Higher number means duplicates
    # Lower number means not generated
    self.assertEqual(
        len(expected_tags), found_count,
        "NLP tags not extracted correctly: Found " + str(found_count) +
        "; Expected: " + str(len(expected_tags)))
    # Should find one if assets get tagged
    result = self.db.tags.find({
        'type': 'NLP',
        'tagged': {"$in": [response_one.json()["asset_id"]]}
    }).count()
    self.assertEqual(
        1, result,
        "Tag has been generated but asset has not been tagged: Found:" +
        str(result) + "; Expected: 1")
    result = self.db.tags.find({
        'type': 'NLP',
        'tagged': {"$in": [response_two.json()["asset_id"]]}
    }).count()
    self.assertEqual(
        1, result,
        "Tag has been generated but asset has not been tagged: Found:" +
        str(result) + "; Expected: 1")
miner_globals.addAggregator("segments", "aggregate.Segments",
                            "segments(start, size) returns aggregate.Segments object")
miner_globals.addAggregator("rate", "aggregate.Rate",
                            "rate(period)(value) gets the rates of the value over defined period")
miner_globals.addAggregator("rateIf", "aggregate.RateIf",
                            "rateIf(period)(cond, exp) gets the rates of the value over defined period filtered by the condition")
miner_globals.addTargetToClassMapping("csv", "io_targets.iCSV", "io_targets.oCSV",
                                      "comma separated value text (unicode=True flag preserves unicode indication in output)")
miner_globals.addTargetToClassMapping("pickle", "io_targets.iPickle", "io_targets.oPickle",
                                      "python object native serialization format")
miner_globals.addTargetToClassMapping("stdout", None, "io_targets.oStdout",
                                      "dumps user friendly formatted output to stdout")
miner_globals.addTargetToClassMapping("less", None, "io_targets.oLess",
                                      "dumps user friendly formatted output to less pager")
miner_globals.addTargetToClassMapping("log", "io_targets.iLog", "io_targets.oLog",
                                      "Processes text file by splitting it to words. Created record is (line, words, NR).\nFS= may specify an alternative regular expression for splitting.")
miner_globals.addTargetToClassMapping("raw", "io_targets.iRaw", "io_targets.oRaw",
                                      "Processes text file without splitting into words. Record is (line,).")
miner_globals.addTargetToClassMapping("json", "io_targets.iJson", "io_targets.oJson",
                                      "Reads json files to 'obj' variable or writes all variables to json list")
miner_globals.addTargetToClassMapping("tsv", "io_targets.iTsv", "io_targets.oTsv",
                                      "tab separated value text")
miner_globals.addExtensionToTargetMapping(".csv", "csv")
miner_globals.addExtensionToTargetMapping(".tsv", "tsv")
miner_globals.addExtensionToTargetMapping(".pic", "pickle")
miner_globals.addExtensionToTargetMapping(".txt", "stdout")
miner_globals.addExtensionToTargetMapping(".log", "log")
miner_globals.addExtensionToTargetMapping(".json", "json")
miner_globals.addExtensionToTargetMapping("stdout", "csv")
DataProvider.registerDataProvider("file", FileDataProvider)
DataProvider.registerDataProvider("repository", RepositoryDataProvider)

import m.db
import m.db.sqlite_engine

sqliteEngine = m.db.sqlite_engine.SQLiteEngine()
m.db.registerEngine("file.db", sqliteEngine)
m.db.registerEngine("file.sqlite", sqliteEngine)
def setUp(self):
    self.data_provider = DataProvider(5, genre_dataset_size=40)
class TestDataProvider(unittest.TestCase):
    def setUp(self):
        self.data_provider = DataProvider(5, genre_dataset_size=40)

    def test_get_output_shape(self):
        self.assertEqual(self.data_provider.get_output_shape(), (5, ))

    def test_setup(self):
        self.data_provider.setup()
        training_data = self.data_provider.get_all_training_data()
        test_data = self.data_provider.get_test_data()
        # Check proportions of training and test sets
        self.assertEqual(training_data.shape[0], 180)
        self.assertEqual(test_data.shape[0], 20)
        ids = numpy.array([])
        for example in training_data:
            ids = numpy.append(ids, example['id'])
        for example in test_data:
            ids = numpy.append(ids, example['id'])
        # Check that the union of training and test examples gives the entire dataset
        numpy.testing.assert_array_equal(numpy.sort(ids),
                                         numpy.sort(numpy.array(5 * (range(40)))))

    def test_get_next_batch(self):
        self.data_provider.setup()
        genres_count = numpy.zeros((5, ), dtype=int)
        for i in range(18):
            batch = self.data_provider.get_next_batch()
            for example in batch:
                genres_count[numpy.argmax(example['out'])] += 1
        numpy.testing.assert_array_equal(genres_count,
                                         numpy.array([36, 36, 36, 36, 36]))

    def test_get_all_training_data(self):
        self.data_provider.setup()
        training_data = self.data_provider.get_all_training_data()
        ids = numpy.array([])
        for example in training_data:
            # Check training examples have ids from the dataset range
            self.assertIn(example['id'], range(40))
            ids = numpy.append(ids, example['id'])

    def test_get_test_data(self):
        self.data_provider.setup()
        test_data = self.data_provider.get_test_data()
        ids = numpy.array([])
        for example in test_data:
            # Check test examples have ids from the dataset range
            self.assertIn(example['id'], range(40))
            ids = numpy.append(ids, example['id'])

    def test_get_test_data_for_genre(self):
        self.data_provider.setup()
        test_data_genre = self.data_provider.get_test_data_for_genre('classical')
        ids = numpy.array([])
        for example in test_data_genre:
            ids = numpy.append(ids, example['id'])
        # Check test set for 1 genre does not contain duplicates
        numpy.testing.assert_array_equal(numpy.unique(ids), ids)
        # Check size of test set per genre
        self.assertEqual(ids.shape[0], 4)

    def test_reset(self):
        self.data_provider.setup()
        for i in range(18):
            batch = self.data_provider.get_next_batch()
            self.assertIsNotNone(batch)
        batch = self.data_provider.get_next_batch()
        self.assertIsNone(batch)
        self.data_provider.reset()
        for i in range(18):
            batch = self.data_provider.get_next_batch()
            self.assertIsNotNone(batch)
        print 'username=[%s] password=[%s]' % (username, password)
        status, account = data_provider.get_account(username, password)
        result = {"status": status, "account": account}
        self.write(json.dumps(result, ensure_ascii=False))


class LinkerManagerRequestHandler(tornado.web.RequestHandler):
    # @tornado.web.authenticated
    def post(self):
        print 'LinkerManager post %s' % self.request.uri
        act = self.get_argument('act')
        if act == 'get_linkers':
            return self.get_linkers()

    def get_linkers(self):
        data = self.request.body
        print 'request body is [%s]' % data
        bee_id = self.get_body_argument('bee', '')
        status, linkers = data_provider.get_linkers(bee_id)
        result = {"status": status, "linkers": linkers}
        self.write(json.dumps(result, ensure_ascii=False))


data_provider = DataProvider()

if __name__ == "__main__":
    cfg = ConfigParser.ConfigParser()
    cfg.read(sys.argv[1])
    data_provider.init(cfg)
    server = tornado.httpserver.HTTPServer(MyApplication())
    server.listen(80)
    tornado.ioloop.IOLoop.instance().start()