def dump_real_series(config):
    """Convert the raw input to a delta series and dump it as a numpy series.

    The column ``config.col_name`` is loaded from ``config.raw_input``, passed
    through ``normalize_series`` and then ``normal_to_delta_series``.  The
    resulting delta series is written with ``series_io.dump_as_np_series``
    under a name obtained from ``tmp_stage_name(config)``; the raw input, the
    returned dump path and the delta series are logged at INFO level.
    """
    delta_series = series_transform.normal_to_delta_series(
        config,
        series_transform.normalize_series(
            config,
            series_io.load_from_df_chunks(config.raw_input, config.col_name),
        ),
    )
    stage_name = tmp_stage_name(config)
    dump_path = series_io.dump_as_np_series(config, stage_name, delta_series)
    logger.info('%r=%r / %r', config.raw_input, dump_path, delta_series)
def dump_as_plain_txt(config):
    """Dump the raw, normalized and delta series as plain text, one dump each.

    The series is loaded from ``config.raw_input`` / ``config.col_name`` and
    written three times with ``series_io.dump_as_plain_txt``: as loaded, after
    ``normalize_series`` and after ``normal_to_delta_series``.  Every dump gets
    its own ``tmp_stage_name(config)`` and its returned path is logged.
    """

    def write_stage(log_format, stage_series):
        # Dump one stage and log the path returned by the writer.
        stage_path = series_io.dump_as_plain_txt(
            config, tmp_stage_name(config), stage_series)
        logger.info(log_format, stage_path)

    raw_series = series_io.load_from_df_chunks(config.raw_input, config.col_name)
    write_stage('raw series=%r', raw_series)

    normal_series = series_transform.normalize_series(config, raw_series)
    write_stage('normal series=%r', normal_series)

    delta_series = series_transform.normal_to_delta_series(config, normal_series)
    write_stage('delta series=%r', delta_series)
def calculate_statistics(config, prefix):
    """Compute delta-series statistics for ``prefix + '.gz'`` and dump the results.

    The input is loaded by ``config.col_name``, normalized and converted to a
    delta series.  Statistics come from
    ``series_stats_calc.calc_stats_from_delta_series``.  The delta series is
    dumped as a numpy series to the stage name built from ``prefix + '.bin'``
    and the statistics via ``dump_prob_distribution`` to the one built from
    ``prefix + '.prob'``.
    """
    infile = prefix + '.gz'
    series_out = tmp_stage_name(config, prefix + '.bin')
    stats_out = tmp_stage_name(config, prefix + '.prob')

    raw_series = series_io.load_from_df_chunks(infile, config.col_name)
    norm_series = series_transform.normalize_series(config, raw_series)
    delta_series = series_transform.normal_to_delta_series(config, norm_series)

    stats = series_stats_calc.calc_stats_from_delta_series(config, delta_series)
    logger.info('stats=%r', stats)

    series_io.dump_as_np_series(config, series_out, delta_series)
    series_io.dump_prob_distribution(config, stats_out, stats)
    logger.info('%s bin=%r, stats=%r', infile, series_out, stats_out)
def calculate_statistics(config, prefix):
    """Build the delta series for ``prefix + '.gz'``, compute its stats, dump both.

    Outputs go to the stage names derived from ``prefix + '.bin'`` (delta
    series, written with ``dump_as_np_series``) and ``prefix + '.prob'``
    (statistics, written with ``dump_prob_distribution``).  The statistics
    and the input/output names are logged at INFO level.
    """
    infile = prefix + '.gz'
    series_out = tmp_stage_name(config, prefix + '.bin')
    stats_out = tmp_stage_name(config, prefix + '.prob')

    # Raw -> normalized -> delta, applied in that order.
    series = series_io.load_from_df_chunks(infile, config.col_name)
    for transform in (series_transform.normalize_series,
                      series_transform.normal_to_delta_series):
        series = transform(config, series)

    stats = series_stats_calc.calc_stats_from_delta_series(config, series)
    logger.info('stats=%r', stats)

    series_io.dump_as_np_series(config, series_out, series)
    series_io.dump_prob_distribution(config, stats_out, stats)
    logger.info('%s bin=%r, stats=%r', infile, series_out, stats_out)
def test_load_from_np_series_idempotent(self):
    """A series dumped as a numpy series must read back unchanged.

    Loads the first raw sample file, dumps it with
    ``series_io.dump_as_np_series``, reads the dump with
    ``series_io.load_from_np_series`` and compares the two reads: the same
    number of data arrays and meta records, element-wise equal data
    (``np.ma.allequal``) and identical ``sid``/``min``/``max``/``start``/
    ``count`` on every meta record.
    """
    config = TestSeriesIO.config
    first_read = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    dump_path = series_io.dump_as_np_series(
        config, tmp_stage_name(config), first_read)
    second_read = series_io.load_from_np_series(config, dump_path)

    first_data, second_data = list(first_read.data), list(second_read.data)
    first_meta, second_meta = list(first_read.meta), list(second_read.meta)

    # zip() stops at the shorter input, so a read-back that lost trailing
    # entries would pass the pairwise checks below; pin the lengths first.
    assert len(first_data) == len(second_data), \
        "data count %d != %d" % (len(first_data), len(second_data))
    assert len(first_meta) == len(second_meta), \
        "meta count %d != %d" % (len(first_meta), len(second_meta))

    for data1, data2 in zip(first_data, second_data):
        assert np.ma.allequal(data1, data2), "%r != %r" % (data1, data2)

    for meta1, meta2 in zip(first_meta, second_meta):
        for field in ('sid', 'min', 'max', 'start', 'count'):
            before, after = getattr(meta1, field), getattr(meta2, field)
            assert before == after, \
                "meta.%s: %r != %r" % (field, before, after)
def test_stat_calc_on_sample(self):
    """Check the probability distribution computed from the first sample file.

    The sample is normalized, converted to a delta series and passed to
    ``calc_stats_from_delta_series``.  Rows of ``stats.prob_dstrb`` unpack as
    ``(sym, cum, dlt)``; presumably symbol, cumulative value and per-symbol
    increment, to be confirmed against ``series_stats_calc``.  The test requires:

    * the first row has ``cum < max_prob`` and ``dlt == cum``;
    * the last row has ``cum == max_prob``;
    * every row has ``0 < cum <= max_prob`` and ``0 < dlt <= min(cum, max_prob)``;
    * ``percentiles_ok`` accepts both histograms' percentiles.

    ``max_prob`` is ``2 ** config.int_len - 1``.
    """
    max_prob = 2 ** self.config.int_len - 1
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    series = series_transform.normalize_series(self.config, series)
    series = series_transform.normal_to_delta_series(self.config, series)
    stats = series_stats_calc.calc_stats_from_delta_series(self.config, series)

    distribution = stats.prob_dstrb
    first_row, last_row = distribution[0], distribution[-1]

    # Dedicated assert methods report both operands when they fail.
    self.assertLess(first_row[1], max_prob)
    self.assertEqual(first_row[2], first_row[1])
    self.assertEqual(last_row[1], max_prob)

    # Collect offending rows so that a failure shows which rows are wrong.
    bad_cum = [
        (sym, cum, dlt) for sym, cum, dlt in distribution
        if not (cum > 0 and cum <= max_prob)
    ]
    self.assertFalse(bad_cum, 'cum outside (0, %r]: %r' % (max_prob, bad_cum))

    bad_dlt = [
        (sym, cum, dlt) for sym, cum, dlt in distribution
        if not (dlt > 0 and dlt <= max_prob and dlt <= cum)
    ]
    self.assertFalse(bad_dlt, 'dlt outside (0, min(cum, %r)]: %r' % (max_prob, bad_dlt))

    self.assertTrue(self.percentiles_ok(stats.full_histo.perc))
    self.assertTrue(self.percentiles_ok(stats.norm_histo.perc))
def test_normalize_series(self):
    """Normalizing must keep series identity and bound the values.

    For the first sample file the normalized series must have as many meta
    records as the source, with matching ``sid``, ``count`` and ``start``.
    Every normalized data array must contain at least one zero, have
    ``meta.max - meta.min >= MIN_SCALE``, satisfy ``max >= min`` and stay at or
    below ``series_transform.get_lerp_max(config)``.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    norm_series = series_transform.normalize_series(self.config, series)
    lerp_max = series_transform.get_lerp_max(self.config)

    source_meta, normal_meta = list(series.meta), list(norm_series.meta)
    # zip() truncates to the shorter input; without this check a normalizer
    # that dropped series would still pass the pairwise comparison.
    self.assertEqual(len(source_meta), len(normal_meta))

    for meta1, meta2 in zip(source_meta, normal_meta):
        self.assertEqual(meta1.sid, meta2.sid)
        self.assertEqual(meta1.count, meta2.count)
        self.assertEqual(meta1.start, meta2.start)

    for meta2, data2 in zip(norm_series.meta, norm_series.data):
        vmin, vmax = data2.min(), data2.max()
        self.assertTrue(np.any(data2 == 0))
        self.assertGreaterEqual(meta2.max - meta2.min, MIN_SCALE)
        # The check fails when vmax < vmin, so report the operands in that order.
        self.assertTrue(vmax - vmin >= 0, "%r < %r" % (vmax, vmin))
        self.assertTrue(np.all(data2 <= lerp_max),
                        '%r = [%d, %d]' % (meta2, vmin, vmax))
def test_dump_as_plain_txt(self):
    """Plain-text dumps must be written for the raw, normalized and delta series.

    The raw dump is read back through ``my_open`` and must contain at least
    ``len(d) + 1`` lines for every data array ``d``.  The normalized and delta
    dumps are only checked for existence.
    """
    config = TestSeriesIO.config
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)

    dump_path = series_io.dump_as_plain_txt(
        config, tmp_stage_name(config), series)
    # unittest assertion instead of a bare ``assert``, which ``python -O`` strips.
    self.assertTrue(os.path.isfile(dump_path),
                    'raw dump %r is not a file' % (dump_path,))
    with my_open(config, dump_path, 'r') as fileobj:
        lines = fileobj.readlines()
    expected_lines = sum(len(d) + 1 for d in series.data)
    self.assertGreaterEqual(len(lines), expected_lines)

    series = series_transform.normalize_series(self.config, series)
    dump_path = series_io.dump_as_plain_txt(
        config, tmp_stage_name(config), series)
    # Previously the returned path was assigned but never checked.
    self.assertTrue(os.path.isfile(dump_path),
                    'normalized dump %r is not a file' % (dump_path,))

    series = series_transform.normal_to_delta_series(self.config, series)
    dump_path = series_io.dump_as_plain_txt(
        config, tmp_stage_name(config), series)
    self.assertTrue(os.path.isfile(dump_path),
                    'delta dump %r is not a file' % (dump_path,))
def test_normalize_series(self):
    """Verify that ``normalize_series`` preserves metadata and bounds the data.

    Checks on the first sample file:

    * source and normalized series have the same number of meta records, and
      each pair agrees on ``sid``, ``count`` and ``start``;
    * each normalized array contains a zero, has a ``meta.max - meta.min``
      span of at least ``MIN_SCALE``, has ``max >= min`` and does not exceed
      ``series_transform.get_lerp_max(config)``.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    norm_series = series_transform.normalize_series(self.config, series)
    lerp_max = series_transform.get_lerp_max(self.config)

    metas_before = list(series.meta)
    metas_after = list(norm_series.meta)
    # Guard against zip() silently dropping unmatched trailing records.
    self.assertEqual(len(metas_before), len(metas_after))

    for meta1, meta2 in zip(metas_before, metas_after):
        self.assertEqual(meta1.sid, meta2.sid)
        self.assertEqual(meta1.count, meta2.count)
        self.assertEqual(meta1.start, meta2.start)

    for meta2, data2 in zip(norm_series.meta, norm_series.data):
        vmin, vmax = data2.min(), data2.max()
        self.assertTrue(np.any(data2 == 0))
        self.assertGreaterEqual(meta2.max - meta2.min, MIN_SCALE)
        # Failure means vmax < vmin; the old message printed them swapped.
        self.assertTrue(vmax - vmin >= 0, "%r < %r" % (vmax, vmin))
        self.assertTrue(np.all(data2 <= lerp_max),
                        '%r = [%d, %d]' % (meta2, vmin, vmax))
def test_dump_as_plain_txt(self):
    """Dump the raw, normalized and delta series as text and check each dump.

    Each stage is written with ``series_io.dump_as_plain_txt`` under a fresh
    ``tmp_stage_name`` and the returned path must be an existing file.  The
    raw dump is also read back via ``my_open`` and must hold at least
    ``sum(len(d) + 1 for d in series.data)`` lines.
    """
    config = TestSeriesIO.config

    def dump_and_check(stage_series, label):
        # Write one stage and require that the reported path exists on disk.
        path = series_io.dump_as_plain_txt(
            config, tmp_stage_name(config), stage_series)
        # self.assertTrue rather than bare ``assert``: survives ``python -O``.
        self.assertTrue(os.path.isfile(path),
                        '%s dump %r is not a file' % (label, path))
        return path

    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    dump_path = dump_and_check(series, 'raw')
    with my_open(config, dump_path, 'r') as fileobj:
        lines = fileobj.readlines()
    expected_lines = sum(len(d) + 1 for d in series.data)
    self.assertGreaterEqual(len(lines), expected_lines)

    # The later stages used to be dumped without looking at the result.
    series = series_transform.normalize_series(self.config, series)
    dump_and_check(series, 'normalized')

    series = series_transform.normal_to_delta_series(self.config, series)
    dump_and_check(series, 'delta')
def test_load_from_np_series_idempotent(self):
    """Round-trip a series through ``dump_as_np_series`` / ``load_from_np_series``.

    The series read back from the dump must have the same number of data
    arrays and meta records as the one loaded from the raw sample file, equal
    data under ``np.ma.allequal`` and equal ``sid``, ``min``, ``max``,
    ``start`` and ``count`` on each meta record.
    """
    config = TestSeriesIO.config
    first_read = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    dump_path = series_io.dump_as_np_series(
        config, tmp_stage_name(config), first_read)
    second_read = series_io.load_from_np_series(config, dump_path)

    # Pairwise zip() comparison alone cannot detect a shorter read-back,
    # because zip() stops at the shorter sequence.
    data_pairs = (list(first_read.data), list(second_read.data))
    meta_pairs = (list(first_read.meta), list(second_read.meta))
    assert len(data_pairs[0]) == len(data_pairs[1]), \
        "data count %d != %d" % (len(data_pairs[0]), len(data_pairs[1]))
    assert len(meta_pairs[0]) == len(meta_pairs[1]), \
        "meta count %d != %d" % (len(meta_pairs[0]), len(meta_pairs[1]))

    for data1, data2 in zip(*data_pairs):
        assert np.ma.allequal(data1, data2), "%r != %r" % (data1, data2)

    for meta1, meta2 in zip(*meta_pairs):
        assert meta1.sid == meta2.sid, "sid: %r != %r" % (meta1.sid, meta2.sid)
        assert meta1.min == meta2.min, "min: %r != %r" % (meta1.min, meta2.min)
        assert meta1.max == meta2.max, "max: %r != %r" % (meta1.max, meta2.max)
        assert meta1.start == meta2.start, \
            "start: %r != %r" % (meta1.start, meta2.start)
        assert meta1.count == meta2.count, \
            "count: %r != %r" % (meta1.count, meta2.count)
def test_cycle_transformations(self):
    """raw -> normalized -> delta -> raw must reproduce the original series.

    The reconstructed series must have the same number of meta records and
    data arrays as the source, matching ``sid``/``count``/``start``, identical
    masks, and values within ``rtol=1e-3`` and
    ``atol = MIN_SCALE / 2 ** (int_len - 1)`` of the originals.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    norm_series = series_transform.normalize_series(self.config, series)
    delta_series = series_transform.normal_to_delta_series(self.config, norm_series)
    raw_series = series_transform.delta_to_raw_series(self.config, delta_series)
    atol = MIN_SCALE / 2 ** (self.config.int_len - 1)

    source_meta, result_meta = list(series.meta), list(raw_series.meta)
    source_data, result_data = list(series.data), list(raw_series.data)
    # zip() stops at the shorter input, so a round trip that dropped series
    # would otherwise pass; require equal lengths before comparing pairs.
    self.assertEqual(len(source_meta), len(result_meta))
    self.assertEqual(len(source_data), len(result_data))

    for meta1, meta2 in zip(source_meta, result_meta):
        self.assertEqual(meta1.sid, meta2.sid)
        self.assertEqual(meta1.count, meta2.count)
        self.assertEqual(meta1.start, meta2.start)

    for data1, data2 in zip(source_data, result_data):
        # Relative errors beyond +/-1e-3, shown only in the failure message.
        outliers = ((data1 - data2) / data2).compressed()
        outliers = outliers[np.logical_or(outliers < -1e-3, outliers > 1e-3)]
        self.assertTrue(np.array_equal(data1.mask, data2.mask))
        self.assertTrue(np.allclose(data1, data2, atol=atol, rtol=1e-3),
                        '\n%r' % outliers)
def test_stat_calc_on_sample(self):
    """Validate ``calc_stats_from_delta_series`` on the first sample file.

    With ``max_prob = 2 ** config.int_len - 1`` and rows of
    ``stats.prob_dstrb`` unpacked as ``(sym, cum, dlt)``:

    * first row: ``cum < max_prob`` and ``dlt == cum``;
    * last row: ``cum == max_prob``;
    * all rows: ``0 < cum <= max_prob``, ``0 < dlt <= max_prob``, ``dlt <= cum``;
    * ``percentiles_ok`` holds for ``full_histo.perc`` and ``norm_histo.perc``.
    """
    max_prob = 2**self.config.int_len - 1
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    series = series_transform.normalize_series(self.config, series)
    series = series_transform.normal_to_delta_series(self.config, series)
    stats = series_stats_calc.calc_stats_from_delta_series(
        self.config, series)

    rows = stats.prob_dstrb
    # assertLess / assertEqual print the compared values on failure,
    # unlike assertTrue(<comparison>).
    self.assertLess(rows[0][1], max_prob)
    self.assertEqual(rows[0][2], rows[0][1])
    self.assertEqual(rows[-1][1], max_prob)

    # Check row by row so that a failure names the offending row.
    for index, (sym, cum, dlt) in enumerate(rows):
        where = 'row %d: sym=%r cum=%r dlt=%r' % (index, sym, cum, dlt)
        self.assertTrue(cum > 0 and cum <= max_prob, where)
        self.assertTrue(dlt > 0 and dlt <= max_prob and dlt <= cum, where)

    self.assertTrue(self.percentiles_ok(stats.full_histo.perc))
    self.assertTrue(self.percentiles_ok(stats.norm_histo.perc))
def test_cycle_transformations(self):
    """Check that normalize -> delta -> raw returns the data it started from.

    Compares the source series with the output of ``delta_to_raw_series``:
    equal counts of meta records and data arrays, equal ``sid``, ``count`` and
    ``start`` per record, equal masks per array, and values close within
    ``rtol=1e-3`` plus an absolute tolerance of
    ``MIN_SCALE / 2**(config.int_len - 1)``.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    norm_series = series_transform.normalize_series(self.config, series)
    delta_series = series_transform.normal_to_delta_series(
        self.config, norm_series)
    raw_series = series_transform.delta_to_raw_series(
        self.config, delta_series)
    atol = MIN_SCALE / 2**(self.config.int_len - 1)

    expected_meta, actual_meta = list(series.meta), list(raw_series.meta)
    expected_data, actual_data = list(series.data), list(raw_series.data)
    # Without these checks zip() would hide series lost during the round trip.
    self.assertEqual(len(expected_meta), len(actual_meta))
    self.assertEqual(len(expected_data), len(actual_data))

    for meta1, meta2 in zip(expected_meta, actual_meta):
        self.assertEqual(meta1.sid, meta2.sid)
        self.assertEqual(meta1.count, meta2.count)
        self.assertEqual(meta1.start, meta2.start)

    for data1, data2 in zip(expected_data, actual_data):
        # Unmasked relative errors outside [-1e-3, 1e-3]; used as the
        # diagnostic text of the closeness assertion below.
        outliers = ((data1 - data2) / data2).compressed()
        outliers = outliers[np.logical_or(outliers < -1e-3,
                                          outliers > 1e-3)]
        self.assertTrue(np.array_equal(data1.mask, data2.mask))
        self.assertTrue(np.allclose(data1, data2, atol=atol, rtol=1e-3),
                        '\n%r' % outliers)
def test_load_from_df_chunks_basic(self):
    """Loading the first sample file yields the expected number of series.

    The number of meta records must equal ``sum(TDATA_SAMPLES)``.  There must
    be one data array per meta record, and each array's length must equal that
    record's ``count``.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    self.assertEqual(len(series.meta), sum(TDATA_SAMPLES))

    data_arrays = list(series.data)
    # zip() truncates to the shorter input; make a missing array visible.
    self.assertEqual(len(data_arrays), len(series.meta))

    for meta, data in zip(series.meta, data_arrays):
        # assertEqual instead of bare ``assert``: not stripped by ``python -O``
        # and reports both lengths on failure.
        self.assertEqual(len(data), meta.count)
def test_load_from_df_chunks_basic(self):
    """Sanity-check ``load_from_df_chunks`` on the first sample file.

    Requires ``sum(TDATA_SAMPLES)`` meta records, the same number of data
    arrays, and ``len(data) == meta.count`` for every (meta, data) pair.
    """
    series = series_io.load_from_df_chunks(CHIST_RAW_FILES[0], CHIST_COL)
    meta_records = list(series.meta)
    data_arrays = list(series.data)

    self.assertEqual(len(series.meta), sum(TDATA_SAMPLES))
    # Pairing with zip() would silently ignore an unmatched trailing entry.
    self.assertEqual(len(meta_records), len(data_arrays))

    for meta, data in zip(meta_records, data_arrays):
        # unittest assertion rather than ``assert`` (dropped under ``-O``).
        self.assertEqual(len(data), meta.count)