def test_build_schema(self):
    illegal_col_regex = re.compile(r'\W|[A-Z]')

    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        Dataset.build_schema(dataset,
                             self.test_data[dataset_name].dtypes)

        # get dataset with new schema
        dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])

        for key in [CREATED_AT, SCHEMA, UPDATED_AT]:
            self.assertTrue(key in dataset.keys())

        df_columns = self.test_data[dataset_name].columns.tolist()
        seen_columns = []

        for column_name, column_attributes in dataset[SCHEMA].items():
            # check column_name is unique
            self.assertFalse(column_name in seen_columns)
            seen_columns.append(column_name)

            # check column name contains only legal characters
            self.assertFalse(illegal_col_regex.search(column_name))

            # check required attributes are present
            self.assertTrue(SIMPLETYPE in column_attributes)
            self.assertTrue(OLAP_TYPE in column_attributes)
            self.assertTrue(LABEL in column_attributes)

            # check label is an original column
            self.assertTrue(column_attributes[LABEL] in df_columns)
            df_columns.remove(column_attributes[LABEL])

        # ensure every original column has a stored column
        self.assertTrue(len(df_columns) == 0)
def get(self):
    Dataset.deleteAll()
    Dumpfile.deleteAll()
    with open("predefined_datasets.yaml", 'r') as f:
        for dataset in yaml.safe_load(f):
            Dataset(name=dataset['name'], voidURI=dataset['voidURI']).put()
    return webapp2.redirect('/datasets')
def import_dataset(_file, dataset):
    """
    For reading a URL and saving the corresponding dataset.
    """
    dframe = read_csv(_file)
    Dataset.build_schema(dataset, dframe.dtypes)
    Observation.save(dframe, dataset)
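# Hedged usage sketch for import_dataset above: create a dataset record, then
# load a CSV into it. The uuid-based id mirrors create_dataset_from_csv later
# in this section; the local file path is hypothetical.
import uuid

def example_import():
    dataset_id = uuid.uuid4().hex
    dataset = Dataset.create(dataset_id)
    import_dataset('/tmp/example.csv', dataset)  # hypothetical path
    return dataset_id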
def setUp(self):
    TestBase.setUp(self)
    self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
    Dataset.build_schema(self.dataset,
                         self.test_data['good_eats.csv'].dtypes)
    self.formula = 'rating'
    self.name = 'test'
def test_update(self):
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        self.assertFalse('field' in dataset)
        Dataset.update(dataset, {'field': {'key': 'value'}})
        dataset = Dataset.find_one(self.test_dataset_ids[dataset_name])
        self.assertTrue('field' in dataset)
        self.assertEqual(dataset['field'], {'key': 'value'})
def test_find(self):
    for dataset_name in self.TEST_DATASETS:
        record = Dataset.save(self.test_dataset_ids[dataset_name])
        cursor = Dataset.find(self.test_dataset_ids[dataset_name])
        rows = [x for x in cursor]
        self.assertTrue(isinstance(cursor, Cursor))
        self.assertEqual(record, rows[0])
        self.assertEqual(record, Dataset.find_one(
            self.test_dataset_ids[dataset_name]))
def test_POST_remove_summary(self):
    Datasets().GET(self.dataset_id, mode=MODE_SUMMARY)
    dataset = Dataset.find_one(self.dataset_id)
    self.assertTrue(isinstance(dataset[STATS], dict))
    self.assertTrue(isinstance(dataset[STATS][ALL], dict))

    self._post_formula()

    # [STATS][ALL] should be removed
    dataset = Dataset.find_one(self.dataset_id)
    self.assertEqual(dataset[STATS].get(ALL), None)
def test_delete(self):
    for dataset_name in self.TEST_DATASETS:
        record = Dataset.save(self.test_dataset_ids[dataset_name])
        records = [x for x in
                   Dataset.find(self.test_dataset_ids[dataset_name])]
        self.assertNotEqual(records, [])
        Dataset.delete(self.test_dataset_ids[dataset_name])
        records = [x for x in
                   Dataset.find(self.test_dataset_ids[dataset_name])]
        self.assertEqual(records, [])
def DELETE(self, dataset_id):
    """
    Delete observations (i.e. the dataset) with hash *dataset_id* from mongo.
    """
    dataset = Dataset.find_one(dataset_id)
    result = None

    if dataset:
        Dataset.delete(dataset_id)
        Observation.delete(dataset)
        result = {SUCCESS: 'deleted dataset: %s' % dataset_id}
    return dump_or_error(result, 'id not found')
def update(cls, dframe, dataset):
    """
    Update *dataset* by overwriting all observations with the given *dframe*.
    """
    previous_dtypes = cls.find(dataset, as_df=True).dtypes.to_dict()
    new_dtypes = dframe.dtypes.to_dict().items()
    cols_to_add = dict([(name, dtype) for name, dtype in new_dtypes
                        if name not in previous_dtypes])
    Dataset.update_schema(dataset, cols_to_add)
    cls.delete(dataset)
    cls.save(dframe, dataset)
    return cls.find(dataset, as_df=True)
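# A minimal sketch of the overwrite semantics of update() above, assuming it
# is defined on the Observation class and a dataset record already exists;
# the column names and frame contents are made up.
import pandas as pd

def example_overwrite(dataset):
    new_frame = pd.DataFrame({'rating': [4, 5], 'comment': ['ok', 'great']})
    # rows are replaced wholesale; previously unseen columns join the schema
    return Observation.update(new_frame, dataset)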
def combined_dataset(cls, ids, window_length):
    dataset = Dataset.empty()
    for session_id in ids:
        session = cls.from_api(session_id)
        windows = list(session.window_gen(window_length=window_length))
        dataset = dataset + session.dataset(windows)
    return dataset
def get(self, datasetID):
    dataset = Dataset.get_by_id(long(datasetID))
    crawl = Crawl(dataset=dataset, status='QUEUED')
    crawl.put()
    # queue the crawl immediately
    crawl.queue(5)
    return webapp2.redirect('/datasets/' + datasetID)
def __init__(self, exchange, period_start: datetime, period_end=None,
             interval=60, *args, **kwargs):
    self.exchange = exchange
    self.interval = interval
    self.period_start = period_start
    self.period_end = period_end
    self.start = datetime.now()
    self.dataset = Dataset().create(data={
        'exchange': '/api/exchanges/' + self.exchange.name.lower(),
        'periodStart': self.period_start,
        'periodEnd': self.period_end,
        'candleSize': self.interval,
        'currency': '/api/currencies/' + self.exchange.currency.lower(),
        'asset': '/api/currencies/' + self.exchange.asset.lower(),
    })
def get(self, datasetID):
    dataset = Dataset.get_by_id(long(datasetID))
    for crawl in Crawl.all().filter('dataset =', dataset).run():
        crawl.delete()
    for dump in Dumpfile.all().filter('dataset =', dataset).run():
        dump.delete()
    dataset.delete()
    logging.info('Deleted dataset ' + datasetID)
    return webapp2.redirect('/datasets')
def dataset(self, windows, remove_seconds=0):
    if len(windows) == 0:
        return Dataset.empty()

    n_samples = len(windows)
    n_channels = len(self.ch_names)
    window_length = np.shape(windows)[1]

    X = np.empty([n_samples, n_channels, window_length])
    y = np.empty([n_samples], dtype=np.int8)

    for i, window in enumerate(windows):
        X[i] = window[:, 0:n_channels].T
        y[i] = int(max(window[:, -1]))

    if remove_seconds > 0:
        change_points = []
        action_labels = []

        for i in range(1, len(y)):
            if y[i] != y[i - 1]:
                change_points.append(i)
                action_labels.append(np.max(y[i - 1:i + 1]))

        remove_distance = (250 * remove_seconds) / window_length
        keep_indices = []

        for i in range(len(y)):
            label = y[i]
            if label == 0:
                # drop zero-labelled samples that fall too close to a
                # label change point
                viable = True
                for point in change_points:
                    if np.abs(i - point) <= remove_distance:
                        viable = False
                if viable:
                    keep_indices.append(i)
            else:
                keep_indices.append(i)

        X = X[keep_indices]
        y = y[keep_indices]

    return Dataset(X, y, self.person_id, self.id)
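# Hedged sketch of the window layout that dataset() above appears to consume:
# each window is a (window_length x (n_channels + 1)) array whose last column
# holds integer labels, and the 250 in remove_distance suggests a 250 Hz
# sampling rate. Shapes and label values here are illustrative only.
import numpy as np

def example_window(window_length=100, n_channels=4):
    window = np.zeros([window_length, n_channels + 1])
    window[:, -1] = 1  # label every sample in this window as action 1
    return window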
def start_dataset_creating():
    start = datetime.datetime.now()
    dataset_name = 'dataset_' + str(time.time()).replace('.', '')
    dataset_dir = os.path.join(current_app.config['DATASET_DIR'], dataset_name)
    os.makedirs(dataset_dir, exist_ok=True)
    dataset = Dataset(name=dataset_name,
                      path=dataset_dir,
                      dt_start=start,
                      status=DatasetStatus.start,
                      type=DatasetType.top_one)
    db.session.add(dataset)
    db.session.commit()
    collector = DatasetCollector(dataset_model=dataset)
    try:
        # TODO: add dataset parameters
        collector.create_doctor_item_base_matrix()
        collector.create_datasets_for_catboost(min_appts=10)
    except Exception as e:
        traceback.print_exc()
        dataset.status = DatasetStatus.fail
        dataset.error = str(e)
    else:
        dataset.status = DatasetStatus.end
    finally:
        dataset.dt_end = datetime.datetime.now()
        db.session.add(dataset)
        db.session.commit()
    return redirect(url_for('dataset.main'))
def setUp(self):
    TestBase.setUp(self)
    self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
    dframe = self.test_data['good_eats.csv']
    Dataset.build_schema(self.dataset, dframe.dtypes)
    Observation.save(dframe, self.dataset)
    self.calculations = [
        'rating',
        'gps',
        'amount + gps_alt',
        'amount - gps_alt',
        'amount + 5',
        'amount - gps_alt + 2.5',
        'amount * gps_alt',
        'amount / gps_alt',
        'amount * gps_alt / 2.5',
        'amount + gps_alt * gps_precision',
        '(amount + gps_alt) * gps_precision',
        'amount = 2',
        '10 < amount',
        '10 < amount + gps_alt',
        'not amount = 2',
        'not(amount = 2)',
        'amount = 2 and 10 < amount',
        'amount = 2 or 10 < amount',
        'not not amount = 2 or 10 < amount',
        'not amount = 2 or 10 < amount',
        '(not amount = 2) or 10 < amount',
        'not(amount = 2 or 10 < amount)',
        'amount ^ 3',
        '(amount + gps_alt) ^ 2 + 100',
        '-amount',
        '-amount < gps_alt - 100',
        'rating in ["delectible"]',
        'risk_factor in ["low_risk"]',
        'amount in ["9.0", "2.0", "20.0"]',
        '(risk_factor in ["low_risk"]) and (amount in ["9.0", "20.0"])',
    ]
    self.places = 5
def parse_datasets(res):
    soup = BeautifulSoup(res.text, 'html.parser')
    views = soup.findAll('div', {'class': 'views-row'})
    datasets = []
    for view in views:
        div = view.find('div', {'class': 'views-field-body'})
        # BeautifulSoup's find returns None (not -1) when nothing matches
        if div is not None:
            el = div.find('a')
            datasetid = el['href'].replace("/download/content/", "")
            name = el.text
            datasets.append(Dataset(datasetid, name))
    return datasets
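# Hedged usage sketch for parse_datasets() above, assuming a requests-style
# response object with a .text attribute from the listing page; the URL is a
# placeholder.
import requests

def example_parse():
    res = requests.get('http://example.com/datasets')
    return parse_datasets(res)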
def create_dataset_from_csv(csv_file):
    """
    Create a dataset from the uploaded .csv file.
    """
    dataset_id = uuid.uuid4().hex
    dataset = Dataset.create(dataset_id)

    # write out to a named tempfile so pandas read_csv has a path to open
    with tempfile.NamedTemporaryFile() as tmpfile:
        tmpfile.write(read_uploaded_file(csv_file))
        tmpfile.flush()
        import_dataset(tmpfile.name, dataset)

    return {ID: dataset_id}
def GET(self, dataset_id, mode=False, query='{}', select=None, group=ALL):
    """
    Return data set for hash *dataset_id*. Execute query *query* in mongo
    if passed. If summary is passed return summary statistics for data set.
    If group is passed group the summary; if summary is false group is
    ignored.
    """
    dataset = Dataset.find_one(dataset_id)
    result = None

    try:
        if dataset:
            if mode == MODE_INFO:
                result = Dataset.schema(dataset)
            elif mode == MODE_SUMMARY:
                result = summarize(dataset, query, select, group)
            else:
                return mongo_to_json(Observation.find(dataset, query, select))
    except JSONError, e:
        result = {ERROR: e.__str__()}
    return dump_or_error(result, 'id not found')
def full_dataset_gen(cls, window_length, count=1, sessions=None):
    if sessions is None:
        Print.info("Fetching sessions")
        sessions = Session.fetch_all(only_real=True, include_timeframes=True)
    for _ in range(count):
        dataset = Dataset.empty()
        for session in sessions:
            windows = list(session.window_gen(window_length=window_length))
            dataset = dataset + session.dataset(windows=windows)
        yield dataset
def sample(self, n):
    """Sample a batch of size n from the experience memory, weighted by score."""
    if len(self.memory) < n:
        raise IndexError('Size of memory ({}) is less than requested '
                         'sample ({})'.format(len(self), n))
    else:
        scores = [x[1] for x in self.memory]
        sample = np.random.choice(len(self), size=n, replace=False,
                                  p=scores / np.sum(scores))
        sample = [self.memory[i] for i in sample]
        smiles = [x[0] for x in sample]
        scores = [x[1] for x in sample]
        prior_likelihood = [x[2] for x in sample]
    tokenized = [self.voc.tokenize(smile) for smile in smiles]
    encoded = [self.voc.encode(tokenized_i) for tokenized_i in tokenized]
    encoded = Dataset.collate_fn(encoded)
    return encoded, np.array(scores), np.array(prior_likelihood)
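# Hedged usage sketch for sample() above, assuming an experience-replay object
# whose memory already holds (smiles, score, prior_likelihood) tuples; the
# batch size of 16 is arbitrary.
def example_sample(experience):
    encoded, scores, prior_likelihood = experience.sample(16)
    # encoded is a collated batch ready for an RNN likelihood call
    return encoded, scores, prior_likelihood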
def summarize(dataset, query, select, group):
    """
    Return a summary for the rows/values filtered by *query* and *select*
    and grouped by *group*, or the overall summary if no group is specified.
    """
    # narrow the list of observations via query/select
    dframe = Observation.find(dataset, query, select, as_df=True)

    # do not allow group by numeric types
    # TODO check schema for valid groupby columns once included
    _type = dframe.dtypes.get(group)
    if group != ALL and (_type is None or _type.type != np.object_):
        return {ERROR: "group: '%s' is not categorical." % group}

    # check cached stats for group and update as necessary
    stats = dataset.get(STATS, {})
    if not stats.get(group):
        stats = {ALL: summarize_df(dframe)} if group == ALL \
            else summarize_with_groups(dframe, stats, group)
        Dataset.update(dataset, {STATS: stats})

    stats_to_return = stats.get(group)

    return dict_from_mongo(stats_to_return if group == ALL
                           else {group: stats_to_return})
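# Hedged sketch of calling summarize() above: the overall summary uses the ALL
# sentinel, and a grouped summary names a categorical column. The query/select
# arguments and the 'rating' column are hypothetical.
def example_summaries(dataset):
    overall = summarize(dataset, query='{}', select=None, group=ALL)
    by_rating = summarize(dataset, query='{}', select=None, group='rating')
    return overall, by_rating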
def _test_summary_no_group(self, results):
    result_keys = results.keys()
    print result_keys
    print self.test_data[self._file_name].columns.tolist()
    self.assertEqual(len(result_keys), self.NUM_COLS)
    columns = [col for col in
               self.test_data[self._file_name].columns.tolist()
               if col not in MONGO_RESERVED_KEYS]
    dataset = Dataset.find_one(self.dataset_id)
    labels_to_slugs = build_labels_to_slugs(dataset)

    for col in columns:
        slug = labels_to_slugs[col]
        self.assertTrue(slug in result_keys,
                        'col (slug): %s in: %s' % (slug, result_keys))
        self.assertTrue(SUMMARY in results[slug].keys())
def generate_yaml_from_netCDF(nc_path, product_name, product_description,
                              no_data):
    # extract data from the netCDF file
    dataset = xarray.load_dataset(nc_path)
    measurements = []
    dims = list(dataset.sizes)

    for var in dataset:
        ds_var = dataset[var].variable
        if "units" in ds_var.attrs.keys():
            measurements.append(Measurement(
                var,
                ds_var.dtype.name,
                ds_var.attrs["units"],
                no_data,
                Path(nc_path).name,
            ))

    # classes generation
    dataset = Dataset(
        product_name,
        dataset.longitude.data,
        dataset.latitude.data,
        measurements,
    )
    product = Product(
        product_name,
        product_description,
        measurements=measurements,
        storage_driver="NetCDF CF",
        storage_dimension_order=dims,
    )

    # YAML configuration: suppress Python object tags in the output
    yaml.emitter.Emitter.process_tag = lambda self, *args, **kw: None
    CWD = os.path.dirname(__file__)

    # product generation
    with open(os.path.join(CWD, "./tests/product_generated.yaml"), "w") as f:
        yaml.dump(product, f, sort_keys=False)

    # dataset generation
    data = yaml.dump(dataset, sort_keys=False)
    data = data.replace("'%", "")
    data = data.replace("%'", "")
    with open(os.path.join(CWD, "./tests/dataset_generated.yaml"), "w") as f:
        f.write(data)
def get(self, datasetID):
    startIn = self.request.get('start').split(':')
    if len(startIn) == 2:
        logging.info('Queuing harvest in ' + startIn[0] + ' hours ' +
                     startIn[1] + ' minutes')
        seconds = int(startIn[0]) * 3600 + int(startIn[1]) * 60
        dataset = Dataset.get_by_id(long(datasetID))
        # TODO store 'interval' param in dataset object (if any)
        crawl = Crawl(dataset=dataset, status='QUEUED')
        crawl.put()
        crawl.queue(seconds)
        return webapp2.redirect('/datasets/' + datasetID)
    else:
        # TODO decent error handling
        logging.info('Invalid crawl time: ' + self.request.get('start'))
        return webapp2.redirect('/datasets/' + datasetID + '?error=true')
def initiate_from_file(self, fname, scoring_function, Prior):
    """Adds experience from a file with SMILES.
    Needs a scoring function and an RNN to score the sequences."""
    with open(fname, 'r') as f:
        smiles = []
        for line in f:
            smile = line.split()[0]
            if Chem.MolFromSmiles(smile):
                smile = Chem.MolToSmiles(Chem.MolFromSmiles(smile),
                                         isomericSmiles=False)
                smiles.append(smile)
    scores = scoring_function(smiles)
    tokenized = [self.voc.tokenize(smile) for smile in smiles]
    encoded = [self.voc.encode(tokenized_i) for tokenized_i in tokenized]
    encoded = Dataset.collate_fn(encoded)
    prior_likelihood, _ = Prior.likelihood(encoded.long())
    prior_likelihood = prior_likelihood.data.cpu().numpy()
    new_experience = zip(smiles, scores, prior_likelihood)
    self.add_experience(new_experience)
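# Hedged usage sketch for initiate_from_file() above, assuming a text file
# with one SMILES string per line, a callable scoring function, and a trained
# prior RNN; all names here are placeholders.
def example_initiate(experience, prior_rnn):
    def scoring_function(smiles):
        return [0.0 for _ in smiles]  # stub: score every molecule as 0.0
    experience.initiate_from_file('seed_smiles.smi', scoring_function,
                                  prior_rnn)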
def create_dataset_from_url(url, allow_local_file=False):
    """
    Load a URL, read from a CSV, create a dataset and return the unique ID.
    """
    _file = None

    try:
        _file = open_data_file(url, allow_local_file)
    except (IOError, urllib2.HTTPError):
        # error reading file/url; fall through to the error return below
        pass

    if not _file:
        # could not get a file handle
        return {ERROR: 'could not get a filehandle for: %s' % url}

    dataset_id = uuid.uuid4().hex
    dataset = Dataset.create(dataset_id)
    import_dataset(_file, dataset)

    return {ID: dataset_id}
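# Hedged sketch of the two outcomes of create_dataset_from_url() above; the
# URL is hypothetical and the keys mirror the ID/ERROR constants used in the
# function.
def example_create_from_url():
    result = create_dataset_from_url('http://example.com/data.csv')
    if ERROR in result:
        return None  # could not open the URL
    return result[ID]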
def __init__(self, exchange: Exchange, timeout=60, *args, **kwargs):
    super().__init__(exchange, timeout, *args, **kwargs)
    self.buy_price = 0
    self.sell_price = 0
    self.stop_loss = 0
    self.market_delta = 0
    self.advised = False
    self.waiting_order = False
    self.fulfilled_orders = []
    self.last_price = 0

    # create a dataset for the session
    self.dataset = Dataset().create(data={
        'exchange': self.exchange.name.lower(),
        'periodStart': datetime.now(),
        'candleSize': 60,
        'currency': self.exchange.currency,
        'asset': self.exchange.asset,
    })
def __init__(self, exchange: Exchange, period_start: datetime,
             period_end=None, interval=60):
    self.launchedAt = datetime.now()

    # try to find an existing dataset for this period
    dataset = Dataset().get({
        "exchange": exchange.name.lower(),
        "currency": exchange.currency.lower(),
        "asset": exchange.asset.lower(),
        "periodStart": period_start,
        "periodEnd": period_end,
        "candleSize": interval
    })

    if dataset and len(dataset) > 0:
        print(dataset)
        print(dataset[0])
        print("Dataset found: " + dataset[0]['uuid'])
        price = Price()
        for prices in price.query('get', {"dataset": dataset[0]['uuid']}):
            for price in prices:
                print(price)
                newPrice = Price()
                newPrice.populate(price)
                exchange.strategy.set_price(newPrice)
                exchange.strategy.run()
    else:
        print("Dataset not found, external API call to " + exchange.name)
        for price in exchange.historical_symbol_ticker_candle(
                period_start, period_end, interval):
            exchange.strategy.set_price(price)
            exchange.strategy.run()

    execution_time = datetime.now() - self.launchedAt
    print('Execution time: ' + str(execution_time.total_seconds()) +
          ' seconds')
    sys.exit(0)
config = yaml.safe_load(f)

# load logger
lc = config['environment']['log_config']
logging.config.fileConfig(lc)
logs = logging.getLogger()

# load device config
cuda = config['environment']['cuda']
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# load dataloader
it = config['test']['image_root']
bs = 1
iz = None
data = Dataset(it, iz, cuda)
loader = DataLoader(data, bs, cuda)

# load color transform network
net_col = col.Generator(2)
net_col = nn.DataParallel(net_col)
net_col = net_col.cuda() if cuda else net_col

# load temporal constraint network
net_tem = tem.Generator(64)
net_tem = nn.DataParallel(net_tem)
net_tem = net_tem.cuda() if cuda else net_tem

# load pretrained models
# col_gen.load_state_dict(torch.load(test['load_pretrain_model'][0], map_location='cpu'))
# tem_gen.load_state_dict(torch.load(test['load_pretrain_model'][1], map_location='cpu'))
elif mode == 'live':
    exchange.start_symbol_ticker_socket(exchange.get_symbol())
elif mode == 'backtest':
    period_start = config('PERIOD_START')
    period_end = config('PERIOD_END')
    print("Backtest period from {} to {} with {} seconds candlesticks.".format(
        period_start, period_end, interval))

    # try to find an existing dataset
    dataset = Dataset().query('get', {
        "exchange": '/api/exchanges/' + exchange.name.lower(),
        "currency": '/api/currencies/' + currency.lower(),
        "asset": '/api/currencies/' + asset.lower(),
        "period_start": period_start,
        "period_end": period_end,
        "candleSize": interval
    })

    if dataset and len(dataset) > 0:
        print(dataset[0])
        price = Price()
        for price in price.query('get', {"dataset": dataset[0]['uuid']}):
            newPrice = Price()
            newPrice.populate(price)
            exchange.strategy.set_price(newPrice)
            exchange.strategy.run()
    else:
        print("Dataset not found, external API call to " + exchange.name)
def get(self):
    datasets = Dataset.all()
    self.render_response('datasets/datasets_listall.html',
                         **{'datasets': datasets})
def post(self):
    dataset = Dataset(name=self.request.get('name'),
                      voidURI=self.request.get('voidURI'))
    dataset.put()
    return webapp2.redirect('/datasets')
except ParseError, err:
    # do not save record, return error
    return {ERROR: err}

record = {
    DATASET_ID: dataset[DATASET_ID],
    cls.FORMULA: formula,
    cls.NAME: name,
}
cls.collection.insert(record)

# invalidate summary ALL since we have a new column
stats = dataset.get(STATS)
if stats:
    del stats[ALL]
    del dataset[STATS]
    Dataset.update(dataset, {STATS: stats})

# call remote calculate and pass calculation id
calculate_column.delay(dataset, dframe, formula, name)
return mongo_remove_reserved_keys(record)

@classmethod
def find(cls, dataset):
    """
    Return the calculations for the given *dataset*.
    """
    return [mongo_remove_reserved_keys(record)
            for record in cls.collection.find({
                DATASET_ID: dataset[DATASET_ID],
            })]
def test_save(self):
    for dataset_name in self.TEST_DATASETS:
        record = Dataset.save(self.test_dataset_ids[dataset_name])
        self.assertTrue(isinstance(record, dict))
        self.assertTrue('_id' in record.keys())
def get(self, datasetID):
    dataset = Dataset.get_by_id(long(datasetID))
    self.response.write(json.dumps(dataset.toJSON()))
def get(self, datasetID):
    dataset = Dataset.get_by_id(long(datasetID))
    self.render_response('datasets/datasets_show.html',
                         **{'dataset': dataset})
def test_create(self):
    for dataset_name in self.TEST_DATASETS:
        dataset = Dataset.create(self.test_dataset_ids[dataset_name])
        self.assertTrue(isinstance(dataset, dict))
from models.dataset import Dataset

# Creating a dataset of dimension 2 in input and 3 in output
dset = Dataset(2, 3)

# Adding datapoints
dset.add_xy([0.0, 1.0], [1.0, 2.0, 0.0])
dset.add_xy([1.0, 0.0], [0.0, 0.0, 2.0])
dset.add_xy([2.0, -1.0], [-1.0, -2.0, 4.0])

# Nearest neighbor query on input, requesting 2 neighbors
dset.nn_x([0.2, 0.5], 2)

# Nearest neighbor query on output, requesting 1 neighbor
dist, index = dset.nn_y([1.0, 1.0, 1.0], 1)

# Retrieving the nearest output of [1.0, 1.0, 1.0]
print dset.get_y(index[0])

# Retrieving the nearest datapoint
print dset.get_xy(index[0])
def _test_calculator(self, delay=True):
    dframe = Observation.find(self.dataset, as_df=True)

    columns = dframe.columns.tolist()
    start_num_cols = len(columns)
    added_num_cols = 0

    column_labels_to_slugs = build_labels_to_slugs(self.dataset)
    label_list, slugified_key_list = [
        list(ary) for ary in zip(*column_labels_to_slugs.items())
    ]

    for idx, formula in enumerate(self.calculations):
        name = 'test-%s' % idx

        if delay:
            task = calculate_column.delay(self.dataset, dframe,
                                          formula, name)
            # test that the task has completed
            self.assertTrue(task.ready())
            self.assertTrue(task.successful())
        else:
            task = calculate_column(self.dataset, dframe, formula, name)

        column_labels_to_slugs = build_labels_to_slugs(self.dataset)

        unslug_name = name
        name = column_labels_to_slugs[unslug_name]

        # test that the updated dataframe is persisted
        dframe = Observation.find(self.dataset, as_df=True)
        self.assertTrue(name in dframe.columns)

        # test the new number of columns
        added_num_cols += 1
        self.assertEqual(start_num_cols + added_num_cols,
                         len(dframe.columns.tolist()))

        # test that the schema is up to date
        dataset = Dataset.find_one(self.dataset[DATASET_ID])
        self.assertTrue(SCHEMA in dataset.keys())
        self.assertTrue(isinstance(dataset[SCHEMA], dict))
        schema = dataset[SCHEMA]

        # test slugified column names
        slugified_key_list.append(name)
        self.assertEqual(sorted(schema.keys()), sorted(slugified_key_list))

        # test column labels
        label_list.append(unslug_name)
        labels = [schema[col][LABEL] for col in schema.keys()]
        self.assertEqual(sorted(labels), sorted(label_list))

        # test result of calculation
        formula = column_labels_to_slugs[formula]

        for idx, row in dframe.iterrows():
            try:
                result = np.float64(row[name])
                stored = np.float64(row[formula])
                # np.nan != np.nan, so skip the comparison if both are nan
                if np.isnan(result) and np.isnan(stored):
                    continue
                msg = self._equal_msg(result, stored, formula)
                self.assertAlmostEqual(result, stored, self.places, msg)
            except ValueError:
                msg = self._equal_msg(row[name], row[formula], formula)
                self.assertEqual(row[name], row[formula], msg)
def setUp(self):
    TestBase.setUp(self)
    self.dataset = Dataset.save(self.test_dataset_ids['good_eats.csv'])
    Dataset.build_schema(self.dataset,
                         self.test_data['good_eats.csv'].dtypes)